Merged
166 changes: 62 additions & 104 deletions Cargo.lock

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions Cargo.toml
@@ -90,20 +90,20 @@ ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
apache-avro = { version = "0.20", default-features = false }
arrow = { version = "55.2.0", features = [
arrow = { version = "56.0.0", features = [
"prettyprint",
"chrono-tz",
] }
arrow-buffer = { version = "55.2.0", default-features = false }
arrow-flight = { version = "55.2.0", features = [
arrow-buffer = { version = "56.0.0", default-features = false }
arrow-flight = { version = "56.0.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "55.2.0", default-features = false, features = [
arrow-ipc = { version = "56.0.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "55.2.0", default-features = false }
arrow-schema = { version = "55.2.0", default-features = false }
async-trait = "0.1.88"
arrow-ord = { version = "56.0.0", default-features = false }
arrow-schema = { version = "56.0.0", default-features = false }
async-trait = "0.1.89"
bigdecimal = "0.4.8"
bytes = "1.10"
chrono = { version = "0.4.41", default-features = false }
@@ -157,7 +157,7 @@ itertools = "0.14"
log = "^0.4"
object_store = { version = "0.12.3", default-features = false }
parking_lot = "0.12"
parquet = { version = "55.2.0", default-features = false, features = [
parquet = { version = "56.0.0", default-features = false, features = [
"arrow",
"async",
"object_store",
2 changes: 1 addition & 1 deletion datafusion-examples/Cargo.toml
@@ -81,7 +81,7 @@ serde_json = { workspace = true }
tempfile = { workspace = true }
test-utils = { path = "../test-utils" }
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
tonic = "0.12.1"
tonic = "0.13.1"
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3" }
url = { workspace = true }
2 changes: 1 addition & 1 deletion datafusion-testing
Submodule datafusion-testing updated 84 files
+3 −3 data/sqlite/random/expr/slt_good_0.slt
+10 −10 data/sqlite/random/expr/slt_good_10.slt
+5 −3 data/sqlite/random/expr/slt_good_100.slt
+3 −3 data/sqlite/random/expr/slt_good_101.slt
+12 −12 data/sqlite/random/expr/slt_good_102.slt
+7 −7 data/sqlite/random/expr/slt_good_104.slt
+9 −9 data/sqlite/random/expr/slt_good_105.slt
+6 −6 data/sqlite/random/expr/slt_good_106.slt
+3 −3 data/sqlite/random/expr/slt_good_107.slt
+6 −6 data/sqlite/random/expr/slt_good_108.slt
+3 −3 data/sqlite/random/expr/slt_good_11.slt
+6 −6 data/sqlite/random/expr/slt_good_110.slt
+6 −6 data/sqlite/random/expr/slt_good_111.slt
+7 −7 data/sqlite/random/expr/slt_good_112.slt
+6 −6 data/sqlite/random/expr/slt_good_113.slt
+3 −3 data/sqlite/random/expr/slt_good_116.slt
+3 −3 data/sqlite/random/expr/slt_good_118.slt
+4 −4 data/sqlite/random/expr/slt_good_119.slt
+9 −7 data/sqlite/random/expr/slt_good_12.slt
+14 −12 data/sqlite/random/expr/slt_good_13.slt
+6 −6 data/sqlite/random/expr/slt_good_14.slt
+6 −6 data/sqlite/random/expr/slt_good_16.slt
+3 −3 data/sqlite/random/expr/slt_good_17.slt
+3 −3 data/sqlite/random/expr/slt_good_2.slt
+6 −6 data/sqlite/random/expr/slt_good_22.slt
+10 −6 data/sqlite/random/expr/slt_good_23.slt
+3 −3 data/sqlite/random/expr/slt_good_24.slt
+12 −13 data/sqlite/random/expr/slt_good_25.slt
+6 −6 data/sqlite/random/expr/slt_good_26.slt
+8 −6 data/sqlite/random/expr/slt_good_27.slt
+3 −3 data/sqlite/random/expr/slt_good_28.slt
+14 −12 data/sqlite/random/expr/slt_good_29.slt
+3 −3 data/sqlite/random/expr/slt_good_3.slt
+3 −3 data/sqlite/random/expr/slt_good_30.slt
+9 −10 data/sqlite/random/expr/slt_good_31.slt
+9 −9 data/sqlite/random/expr/slt_good_33.slt
+3 −3 data/sqlite/random/expr/slt_good_34.slt
+3 −3 data/sqlite/random/expr/slt_good_36.slt
+3 −3 data/sqlite/random/expr/slt_good_38.slt
+3 −3 data/sqlite/random/expr/slt_good_39.slt
+3 −3 data/sqlite/random/expr/slt_good_4.slt
+3 −3 data/sqlite/random/expr/slt_good_41.slt
+6 −6 data/sqlite/random/expr/slt_good_44.slt
+3 −3 data/sqlite/random/expr/slt_good_45.slt
+3 −3 data/sqlite/random/expr/slt_good_47.slt
+3 −3 data/sqlite/random/expr/slt_good_49.slt
+3 −3 data/sqlite/random/expr/slt_good_5.slt
+3 −3 data/sqlite/random/expr/slt_good_50.slt
+3 −3 data/sqlite/random/expr/slt_good_55.slt
+9 −9 data/sqlite/random/expr/slt_good_56.slt
+5 −3 data/sqlite/random/expr/slt_good_57.slt
+4 −4 data/sqlite/random/expr/slt_good_58.slt
+9 −9 data/sqlite/random/expr/slt_good_6.slt
+3 −3 data/sqlite/random/expr/slt_good_60.slt
+3 −3 data/sqlite/random/expr/slt_good_61.slt
+3 −3 data/sqlite/random/expr/slt_good_62.slt
+6 −6 data/sqlite/random/expr/slt_good_63.slt
+3 −3 data/sqlite/random/expr/slt_good_64.slt
+3 −3 data/sqlite/random/expr/slt_good_66.slt
+14 −12 data/sqlite/random/expr/slt_good_67.slt
+5 −3 data/sqlite/random/expr/slt_good_68.slt
+3 −3 data/sqlite/random/expr/slt_good_69.slt
+3 −3 data/sqlite/random/expr/slt_good_7.slt
+5 −3 data/sqlite/random/expr/slt_good_72.slt
+3 −3 data/sqlite/random/expr/slt_good_75.slt
+3 −3 data/sqlite/random/expr/slt_good_78.slt
+3 −3 data/sqlite/random/expr/slt_good_79.slt
+3 −3 data/sqlite/random/expr/slt_good_8.slt
+3 −3 data/sqlite/random/expr/slt_good_80.slt
+10 −10 data/sqlite/random/expr/slt_good_81.slt
+10 −10 data/sqlite/random/expr/slt_good_82.slt
+6 −6 data/sqlite/random/expr/slt_good_83.slt
+11 −12 data/sqlite/random/expr/slt_good_84.slt
+7 −7 data/sqlite/random/expr/slt_good_85.slt
+3 −3 data/sqlite/random/expr/slt_good_88.slt
+3 −3 data/sqlite/random/expr/slt_good_89.slt
+11 −12 data/sqlite/random/expr/slt_good_92.slt
+8 −6 data/sqlite/random/expr/slt_good_93.slt
+3 −3 data/sqlite/random/expr/slt_good_94.slt
+3 −3 data/sqlite/random/expr/slt_good_96.slt
+8 −6 data/sqlite/random/expr/slt_good_97.slt
+2 −2 data/sqlite/random/groupby/slt_good_12.slt
+4 −4 data/sqlite/random/groupby/slt_good_4.slt
+5 −5 data/sqlite/random/groupby/slt_good_7.slt
2 changes: 1 addition & 1 deletion datafusion/common/Cargo.toml
@@ -71,7 +71,7 @@ log = { workspace = true }
object_store = { workspace = true, optional = true }
parquet = { workspace = true, optional = true, default-features = true }
paste = "1.0.15"
pyo3 = { version = "0.24.2", optional = true }
pyo3 = { version = "0.25", optional = true }
recursive = { workspace = true, optional = true }
sqlparser = { workspace = true }
tokio = { workspace = true }
16 changes: 1 addition & 15 deletions datafusion/common/src/config.rs
@@ -602,13 +602,6 @@ config_namespace! {
/// default parquet writer setting
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())

-/// (writing) Sets max statistics size for any column. If NULL, uses
-/// default parquet writer setting
-/// max_statistics_size is deprecated, currently it is not being used
-// TODO: remove once deprecated
-#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
-pub max_statistics_size: Option<usize>, default = Some(4096)

/// (writing) Target maximum number of rows in each row group (defaults to 1M
/// rows). Writing larger row groups requires more memory to write, but
/// can get better compression and be faster to read.
@@ -622,7 +615,7 @@ config_namespace! {

/// (writing) Sets statistics truncate length. If NULL, uses
/// default parquet writer setting
-pub statistics_truncate_length: Option<usize>, default = None
+pub statistics_truncate_length: Option<usize>, default = Some(64)

/// (writing) Sets best effort maximum number of rows in data page
pub data_page_row_count_limit: usize, default = 20_000
@@ -2141,13 +2134,6 @@ config_namespace_with_hashmap! {
/// Sets bloom filter number of distinct values. If NULL, uses
/// default parquet options
pub bloom_filter_ndv: Option<u64>, default = None

-/// Sets max statistics size for the column path. If NULL, uses
-/// default parquet options
-/// max_statistics_size is deprecated, currently it is not being used
-// TODO: remove once deprecated
-#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
-pub max_statistics_size: Option<usize>, default = None
}
}

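Note: with this hunk, the deprecated `max_statistics_size` option is gone and `statistics_truncate_length` now defaults to 64 bytes. A minimal sketch of overriding the new default at runtime, assuming DataFusion's usual `SET` statement support (the key name follows the config namespace above):

```rust
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    // Raise the statistics truncation limit from the new 64-byte default.
    ctx.sql("SET datafusion.execution.parquet.statistics_truncate_length = 256")
        .await?;
    Ok(())
}
```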
25 changes: 1 addition & 24 deletions datafusion/common/src/file_options/parquet_writer.rs
@@ -35,7 +35,7 @@ use parquet::{
metadata::KeyValue,
properties::{
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
-DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
+DEFAULT_STATISTICS_ENABLED,
},
},
schema::types::ColumnPath,
@@ -160,16 +160,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
builder =
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
}

-// max_statistics_size is deprecated, currently it is not being used
-// TODO: remove once deprecated
-#[allow(deprecated)]
-if let Some(max_statistics_size) = options.max_statistics_size {
-builder = {
-#[allow(deprecated)]
-builder.set_column_max_statistics_size(path, max_statistics_size)
-}
-}
}

Ok(builder)
@@ -218,7 +208,6 @@ impl ParquetOptions {
dictionary_enabled,
dictionary_page_size_limit,
statistics_enabled,
-max_statistics_size,
max_row_group_size,
created_by,
column_index_truncate_length,
@@ -264,13 +253,6 @@ impl ParquetOptions {
.set_data_page_row_count_limit(*data_page_row_count_limit)
.set_bloom_filter_enabled(*bloom_filter_on_write);

-builder = {
-#[allow(deprecated)]
-builder.set_max_statistics_size(
-max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
-)
-};

if let Some(bloom_filter_fpp) = bloom_filter_fpp {
builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
};
@@ -463,12 +445,10 @@ mod tests {
fn column_options_with_non_defaults(
src_col_defaults: &ParquetOptions,
) -> ParquetColumnOptions {
-#[allow(deprecated)] // max_statistics_size
ParquetColumnOptions {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
statistics_enabled: Some("none".into()),
-max_statistics_size: Some(72),
encoding: Some("RLE".into()),
bloom_filter_enabled: Some(true),
bloom_filter_fpp: Some(0.72),
@@ -493,7 +473,6 @@
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
dictionary_page_size_limit: 42,
statistics_enabled: Some("chunk".into()),
-max_statistics_size: Some(42),
max_row_group_size: 42,
created_by: "wordy".into(),
column_index_truncate_length: Some(42),
@@ -551,7 +530,6 @@
),
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
-max_statistics_size: Some(props.max_statistics_size(&col)),
}
}

@@ -608,7 +586,6 @@
compression: default_col_props.compression,
dictionary_enabled: default_col_props.dictionary_enabled,
statistics_enabled: default_col_props.statistics_enabled,
-max_statistics_size: default_col_props.max_statistics_size,
bloom_filter_on_write: default_col_props
.bloom_filter_enabled
.unwrap_or_default(),
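With the deprecated column-level `max_statistics_size` plumbing removed, statistics truncation is controlled solely by the parquet crate's truncate-length property. A hedged sketch of the replacement call (the same API the fuzz test further down switches to):

```rust
use parquet::file::properties::{EnabledStatistics, WriterProperties};

fn main() {
    // Sketch only: the deprecated set_max_statistics_size call is gone;
    // set_statistics_truncate_length now bounds the stored min/max bytes.
    let _props = WriterProperties::builder()
        .set_statistics_enabled(EnabledStatistics::Page)
        .set_statistics_truncate_length(Some(64))
        .build();
}
```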
19 changes: 14 additions & 5 deletions datafusion/common/src/scalar/mod.rs
@@ -904,11 +904,10 @@ pub fn dict_from_values<K: ArrowDictionaryKeyType>(
.map(|index| {
if values_array.is_valid(index) {
let native_index = K::Native::from_usize(index).ok_or_else(|| {
-DataFusionError::Internal(format!(
-"Can not create index of type {} from value {}",
-K::DATA_TYPE,
-index
-))
+_internal_datafusion_err!(
+"Can not create index of type {} from value {index}",
+K::DATA_TYPE
+)
})?;
Ok(Some(native_index))
} else {
@@ -2203,6 +2202,16 @@ impl ScalarValue {
}

let array: ArrayRef = match &data_type {
+DataType::Decimal32(_precision, _scale) => {
+return _not_impl_err!(
+"Decimal32 not supported in ScalarValue::iter_to_array"
+);
+}
+DataType::Decimal64(_precision, _scale) => {
+return _not_impl_err!(
+"Decimal64 not supported in ScalarValue::iter_to_array"
+);
+}
DataType::Decimal128(precision, scale) => {
let decimal_array =
ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?;
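Arrow 56 introduces 32- and 64-bit decimal types, which is why `iter_to_array` needs explicit (if unimplemented) arms for them. A small sketch, assuming arrow 56's `Decimal32Array` API, of a value that would reach the new error path if funneled through `ScalarValue`:

```rust
use arrow::array::{Array, Decimal32Array};
use arrow::datatypes::DataType;

fn main() {
    // Building the array directly works; converting equivalent scalars via
    // ScalarValue::iter_to_array would return the new "not supported" error.
    let arr = Decimal32Array::from(vec![100_i32, 205, 300])
        .with_precision_and_scale(5, 2)
        .expect("valid precision/scale for Decimal32");
    assert_eq!(arr.data_type(), &DataType::Decimal32(5, 2));
}
```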
5 changes: 4 additions & 1 deletion datafusion/common/src/types/native.rs
@@ -407,7 +407,10 @@ impl From<DataType> for NativeType {
DataType::Union(union_fields, _) => {
Union(LogicalUnionFields::from(&union_fields))
}
-DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
+DataType::Decimal32(p, s)
+| DataType::Decimal64(p, s)
+| DataType::Decimal128(p, s)
+| DataType::Decimal256(p, s) => Decimal(p, s),
DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
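A tiny illustration of the widened mapping (hedged: assumes `NativeType` is reachable at `datafusion_common::types` and implements `PartialEq`, as the surrounding code suggests):

```rust
use arrow::datatypes::DataType;
use datafusion_common::types::NativeType;

fn main() {
    // Every physical decimal width now maps to the same logical decimal.
    let logical: NativeType = DataType::Decimal32(5, 2).into();
    assert_eq!(logical, NativeType::Decimal(5, 2));
}
```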
27 changes: 26 additions & 1 deletion datafusion/core/src/datasource/file_format/parquet.rs
@@ -523,11 +523,23 @@ mod tests {
let dic_array = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?;
let c_dic: ArrayRef = Arc::new(dic_array);

-let batch1 = RecordBatch::try_from_iter(vec![("c_dic", c_dic)])?;
+// Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null]
+let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![
+Some("a".repeat(128)),
+None,
+Some("b".repeat(128)),
+None,
+]));
+
+let batch1 = RecordBatch::try_from_iter(vec![
+("c_dic", c_dic),
+("string_truncation", string_truncation),
+])?;

// Use store_parquet to write each batch to its own file
// . batch1 written into first file and includes:
// - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available.
+// - column string_truncation that has 4 rows with 2 nulls. Stats min and max of string column is available but not exact.
let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
LocalFileSystem::new(),
)));
@@ -563,6 +575,19 @@
Precision::Exact(Utf8(Some("a".into())))
);

+// column string_truncation
+let string_truncation_stats = &stats.column_statistics[1];
+
+assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
+assert_eq!(
+string_truncation_stats.max_value,
+Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
+);
+assert_eq!(
+string_truncation_stats.min_value,
+Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
+);

Ok(())
}

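The asserted values encode how parquet truncates string statistics at the new 64-byte default: the min is simply cut to 64 bytes, while the max is cut and its last byte incremented so it still upper-bounds the real values, and both become `Inexact`. A standalone sketch of that truncate-and-increment idea (not the parquet crate's actual implementation):

```rust
/// Truncate a max statistic to `len` bytes while keeping it a valid upper
/// bound: bump the last byte that can be bumped. Returns None if every
/// retained byte is already 0xFF.
fn truncate_max(bytes: &[u8], len: usize) -> Option<Vec<u8>> {
    if bytes.len() <= len {
        return Some(bytes.to_vec());
    }
    let mut out = bytes[..len].to_vec();
    for i in (0..out.len()).rev() {
        if out[i] < u8::MAX {
            out[i] += 1;
            out.truncate(i + 1);
            return Some(out);
        }
    }
    None
}

fn main() {
    // 128 'b's truncated to 64 bytes become 63 'b's plus a 'c',
    // matching the Inexact max asserted in the test above.
    let expected = format!("{}c", "b".repeat(63)).into_bytes();
    assert_eq!(truncate_max(&[b'b'; 128], 64), Some(expected));
}
```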
11 changes: 3 additions & 8 deletions datafusion/core/tests/fuzz_cases/pruning.rs
@@ -319,14 +319,9 @@ async fn write_parquet_file(
row_groups: Vec<Vec<String>>,
) -> Bytes {
let mut buf = BytesMut::new().writer();
-let mut props = WriterProperties::builder();
-if let Some(truncation_length) = truncation_length {
-props = {
-#[allow(deprecated)]
-props.set_max_statistics_size(truncation_length)
-}
-}
-props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
+let props = WriterProperties::builder()
+.set_statistics_enabled(EnabledStatistics::Chunk) // row group level
+.set_statistics_truncate_length(truncation_length);
let props = props.build();
{
let mut writer =
4 changes: 2 additions & 2 deletions datafusion/core/tests/parquet/mod.rs
@@ -110,11 +110,11 @@ struct ContextWithParquet {

/// The output of running one of the test cases
struct TestOutput {
-/// The input string
+/// The input query SQL
sql: String,
/// Execution metrics for the Parquet Scan
parquet_metrics: MetricsSet,
-/// number of rows in results
+/// number of actual rows in results
result_rows: usize,
/// the contents of the input, as a string
pretty_input: String,
14 changes: 8 additions & 6 deletions datafusion/core/tests/parquet/row_group_pruning.rs
@@ -34,7 +34,7 @@ struct RowGroupPruningTest {
expected_files_pruned_by_statistics: Option<usize>,
expected_row_group_matched_by_bloom_filter: Option<usize>,
expected_row_group_pruned_by_bloom_filter: Option<usize>,
-expected_results: usize,
+expected_rows: usize,
}
impl RowGroupPruningTest {
// Start building the test configuration
@@ -48,7 +48,7 @@
expected_files_pruned_by_statistics: None,
expected_row_group_matched_by_bloom_filter: None,
expected_row_group_pruned_by_bloom_filter: None,
-expected_results: 0,
+expected_rows: 0,
}
}

@@ -99,9 +99,9 @@ impl RowGroupPruningTest {
self
}

-// Set the expected rows for the test
+/// Set the number of expected rows from the output of this test
fn with_expected_rows(mut self, rows: usize) -> Self {
-self.expected_results = rows;
+self.expected_rows = rows;
self
}

@@ -145,8 +145,10 @@ impl RowGroupPruningTest {
);
assert_eq!(
output.result_rows,
-self.expected_results,
-"mismatched expected rows: {}",
+self.expected_rows,
+"Expected {} rows, got {}: {}",
+self.expected_rows,
+output.result_rows,
output.description(),
);
}
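For orientation, the harness is driven fluently; a hypothetical call chain follows (every method here except `with_expected_rows` is assumed from the struct fields above rather than shown in this diff):

```rust
// Hypothetical usage sketch of the renamed builder field.
RowGroupPruningTest::new()
    .with_query("SELECT * FROM t WHERE i = 1")
    .with_expected_rows(1)
    .test_row_group_prune()
    .await;
```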
2 changes: 2 additions & 0 deletions datafusion/datasource-avro/src/avro_to_arrow/schema.rs
@@ -238,6 +238,8 @@ fn default_field_name(dt: &DataType) -> &str {
| DataType::LargeListView(_) => {
unimplemented!("View support not implemented")
}
DataType::Decimal32(_, _) => "decimal",
DataType::Decimal64(_, _) => "decimal",
DataType::Decimal128(_, _) => "decimal",
DataType::Decimal256(_, _) => "decimal",
}
2 changes: 1 addition & 1 deletion datafusion/datasource-parquet/src/file_format.rs
@@ -19,11 +19,11 @@

use std::any::Any;
use std::cell::RefCell;
+use std::fmt;
use std::fmt::Debug;
use std::ops::Range;
use std::rc::Rc;
use std::sync::Arc;
-use std::{fmt, vec};

use arrow::array::RecordBatch;
use arrow::datatypes::{Fields, Schema, SchemaRef, TimeUnit};
2 changes: 2 additions & 0 deletions datafusion/expr/src/utils.rs
@@ -814,6 +814,8 @@ pub fn can_hash(data_type: &DataType) -> bool {
DataType::Float16 => true,
DataType::Float32 => true,
DataType::Float64 => true,
+DataType::Decimal32(_, _) => true,
+DataType::Decimal64(_, _) => true,
DataType::Decimal128(_, _) => true,
DataType::Decimal256(_, _) => true,
DataType::Timestamp(_, _) => true,
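A quick check of the widened predicate (hedged: assumes `can_hash` stays exported from `datafusion_expr::utils`, consistent with its `pub fn` signature):

```rust
use arrow::datatypes::DataType;
use datafusion_expr::utils::can_hash;

fn main() {
    // The new small decimals are now valid hash-join key types.
    assert!(can_hash(&DataType::Decimal32(5, 2)));
    assert!(can_hash(&DataType::Decimal64(10, 3)));
}
```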