Skip to content

Commit e6d0587

Browse files
shawnding(丁晓坤)alamb
authored andcommitted
ARROW-10781:[Rust] [DataFusion] add the 'Statistics' interface in data source
Add the 'Statistics' interface in data source Closes #8866 from XiaokunDing/statistics_interface Authored-by: shawnding(丁晓坤) <shawnding@tencent.com> Signed-off-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 8b6f827 commit e6d0587

File tree

5 files changed

+40
-0
lines changed

5 files changed

+40
-0
lines changed

rust/datafusion/src/datasource/csv.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use std::any::Any;
3838
use std::string::String;
3939
use std::sync::Arc;
4040

41+
use crate::datasource::datasource::Statistics;
4142
use crate::datasource::TableProvider;
4243
use crate::error::{DataFusionError, Result};
4344
use crate::physical_plan::csv::CsvExec;
@@ -52,6 +53,7 @@ pub struct CsvFile {
5253
has_header: bool,
5354
delimiter: u8,
5455
file_extension: String,
56+
statistics: Option<Statistics>,
5557
}
5658

5759
impl CsvFile {
@@ -75,6 +77,7 @@ impl CsvFile {
7577
has_header: options.has_header,
7678
delimiter: options.delimiter,
7779
file_extension: String::from(options.file_extension),
80+
statistics: None,
7881
})
7982
}
8083
}
@@ -104,4 +107,8 @@ impl TableProvider for CsvFile {
104107
batch_size,
105108
)?))
106109
}
110+
111+
fn statistics(&self) -> Option<Statistics> {
112+
self.statistics.clone()
113+
}
107114
}

rust/datafusion/src/datasource/datasource.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@ use crate::arrow::datatypes::SchemaRef;
2424
use crate::error::Result;
2525
use crate::physical_plan::ExecutionPlan;
2626

27+
/// This table statistics are estimates.
28+
/// It can not be used directly in the precise compute
29+
#[derive(Clone)]
30+
pub struct Statistics {
31+
/// The number of table rows
32+
pub num_rows: usize,
33+
/// total byte of the table rows
34+
pub total_byte_size: usize,
35+
}
36+
2737
/// Source table
2838
pub trait TableProvider {
2939
/// Returns the table provider as [`Any`](std::any::Any) so that it can be
@@ -39,4 +49,8 @@ pub trait TableProvider {
3949
projection: &Option<Vec<usize>>,
4050
batch_size: usize,
4151
) -> Result<Arc<dyn ExecutionPlan>>;
52+
53+
/// Returns the table Statistics
54+
/// Statistics should be optional because not all data sources can provide statistics.
55+
fn statistics(&self) -> Option<Statistics>;
4256
}

rust/datafusion/src/datasource/memory.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ use std::sync::Arc;
2525
use arrow::datatypes::{Field, Schema, SchemaRef};
2626
use arrow::record_batch::RecordBatch;
2727

28+
use crate::datasource::datasource::Statistics;
2829
use crate::datasource::TableProvider;
2930
use crate::error::{DataFusionError, Result};
3031
use crate::physical_plan::common;
@@ -35,6 +36,7 @@ use crate::physical_plan::ExecutionPlan;
3536
pub struct MemTable {
3637
schema: SchemaRef,
3738
batches: Vec<Vec<RecordBatch>>,
39+
statistics: Option<Statistics>,
3840
}
3941

4042
impl MemTable {
@@ -48,6 +50,7 @@ impl MemTable {
4850
Ok(Self {
4951
schema,
5052
batches: partitions,
53+
statistics: None,
5154
})
5255
} else {
5356
Err(DataFusionError::Plan(
@@ -132,6 +135,10 @@ impl TableProvider for MemTable {
132135
projection.clone(),
133136
)?))
134137
}
138+
139+
fn statistics(&self) -> Option<Statistics> {
140+
self.statistics.clone()
141+
}
135142
}
136143

137144
#[cfg(test)]

rust/datafusion/src/datasource/parquet.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use std::sync::Arc;
2323

2424
use arrow::datatypes::*;
2525

26+
use crate::datasource::datasource::Statistics;
2627
use crate::datasource::TableProvider;
2728
use crate::error::Result;
2829
use crate::physical_plan::parquet::ParquetExec;
@@ -32,6 +33,7 @@ use crate::physical_plan::ExecutionPlan;
3233
pub struct ParquetTable {
3334
path: String,
3435
schema: SchemaRef,
36+
statistics: Option<Statistics>,
3537
}
3638

3739
impl ParquetTable {
@@ -42,6 +44,7 @@ impl ParquetTable {
4244
Ok(Self {
4345
path: path.to_string(),
4446
schema,
47+
statistics: None,
4548
})
4649
}
4750
}
@@ -69,6 +72,10 @@ impl TableProvider for ParquetTable {
6972
batch_size,
7073
)?))
7174
}
75+
76+
fn statistics(&self) -> Option<Statistics> {
77+
self.statistics.clone()
78+
}
7279
}
7380

7481
#[cfg(test)]

rust/datafusion/tests/dataframe.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
2020
use arrow::error::Result as ArrowResult;
2121
use arrow::record_batch::RecordBatch;
2222

23+
use datafusion::datasource::datasource::Statistics;
2324
use datafusion::datasource::TableProvider;
2425
use datafusion::error::{DataFusionError, Result};
2526

@@ -145,6 +146,10 @@ impl TableProvider for CustomTableProvider {
145146
projection: projection.clone(),
146147
}))
147148
}
149+
150+
fn statistics(&self) -> Option<Statistics> {
151+
None
152+
}
148153
}
149154

150155
#[tokio::test]

0 commit comments

Comments
 (0)