Skip to content

Commit 53415ca

Browse files
removed encodings, added statistics
1 parent ca87b28 commit 53415ca

File tree

2 files changed

+97
-26
lines changed

2 files changed

+97
-26
lines changed

Diff for: src/catalog/column.rs

+96
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,40 @@ use parquet::file::statistics::Statistics;
2626
/// Column statistics for boolean values: min/max bounds plus count and
/// exactness metadata.
///
/// Populated from parquet `Statistics::Boolean`. When two `Bool` statistics
/// are combined in `impl TypedStatistics`, each field is merged as described
/// on the field.
pub struct BoolType {
2727
/// Lower bound; merging keeps `min(this.min, other.min)`.
pub min: bool,
2828
/// Upper bound; merging keeps `max(this.max, other.max)`.
pub max: bool,
29+
/// Number of distinct values, read with `distinct_count()` + `expect`, so the
/// conversion panics when parquet did not record it.
/// Merging adds the two counts.
/// NOTE(review): a sum over-counts values present in both operands, so the
/// merged figure is an upper bound rather than an exact count; confirm intent.
pub distinct_count: u64,
30+
/// Number of nulls, read with `null_count_opt()` + `expect`; merging adds the
/// two counts.
pub null_count: u64,
31+
/// Copied from `max_is_exact()`; merging ANDs the two flags.
pub is_max_value_exact: bool,
32+
/// Copied from `min_is_exact()`; merging ANDs the two flags.
pub is_min_value_exact: bool,
2933
}
3034

3135
/// Column statistics for floating-point values: min/max bounds plus count and
/// exactness metadata.
///
/// Populated from parquet `Statistics::Float` (values widened with `as f64`)
/// and `Statistics::Double`. When two `Float` statistics are combined in
/// `impl TypedStatistics`, each field is merged as described on the field.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
3236
pub struct Float64Type {
3337
/// Lower bound; merging keeps `this.min.min(other.min)`.
pub min: f64,
3438
/// Upper bound; merging keeps `this.max.max(other.max)`.
pub max: f64,
39+
/// Number of distinct values, read with `distinct_count()` + `expect`, so the
/// conversion panics when parquet did not record it.
/// Merging adds the two counts.
/// NOTE(review): a sum over-counts values present in both operands, so the
/// merged figure is an upper bound rather than an exact count; confirm intent.
/// NOTE(review): the panic text used for this type says "Boolean stats ...".
pub distinct_count: u64,
40+
/// Number of nulls, read with `null_count_opt()` + `expect`; merging adds the
/// two counts.
pub null_count: u64,
41+
/// Copied from `max_is_exact()`; merging ANDs the two flags.
pub is_max_value_exact: bool,
42+
/// Copied from `min_is_exact()`; merging ANDs the two flags.
pub is_min_value_exact: bool,
3543
}
3644

3745
/// Column statistics for integer values: min/max bounds plus count and
/// exactness metadata.
///
/// Populated from parquet `Statistics::Int32` (values widened with `as i64`),
/// `Statistics::Int64`, and `Statistics::Int96` (converted with `to_i64()`).
/// When two `Int` statistics are combined in `impl TypedStatistics`, each
/// field is merged as described on the field.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
3846
pub struct Int64Type {
3947
/// Lower bound; merging keeps `min(this.min, other.min)`.
pub min: i64,
4048
/// Upper bound; merging keeps `max(this.max, other.max)`.
pub max: i64,
49+
/// Number of distinct values, read with `distinct_count()` + `expect`, so the
/// conversion panics when parquet did not record it.
/// Merging adds the two counts.
/// NOTE(review): a sum over-counts values present in both operands, so the
/// merged figure is an upper bound rather than an exact count; confirm intent.
/// NOTE(review): the panic text used for this type says "Boolean stats ...".
pub distinct_count: u64,
50+
/// Number of nulls, read with `null_count_opt()` + `expect`; merging adds the
/// two counts.
pub null_count: u64,
51+
/// Copied from `max_is_exact()`; merging ANDs the two flags.
pub is_max_value_exact: bool,
52+
/// Copied from `min_is_exact()`; merging ANDs the two flags.
pub is_min_value_exact: bool,
4153
}
4254

4355
/// Column statistics for string values: min/max bounds plus count and
/// exactness metadata.
///
/// Populated from parquet `Statistics::ByteArray` and
/// `Statistics::FixedLenByteArray`. When two `String` statistics are combined
/// in `impl TypedStatistics`, each field is merged as described on the field.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
4456
pub struct Utf8Type {
4557
/// Lower bound; merging keeps `min(this.min, other.min)`.
pub min: String,
4658
/// Upper bound; merging keeps `max(this.max, other.max)`.
pub max: String,
59+
/// Number of distinct values, read with `distinct_count()` + `expect`, so the
/// conversion panics when parquet did not record it.
/// Merging adds the two counts.
/// NOTE(review): a sum over-counts values present in both operands, so the
/// merged figure is an upper bound rather than an exact count; confirm intent.
/// NOTE(review): the panic text used for this type says "Boolean stats ...".
pub distinct_count: u64,
60+
/// Number of nulls, read with `null_count_opt()` + `expect`; merging adds the
/// two counts.
pub null_count: u64,
61+
/// Copied from `max_is_exact()`; merging ANDs the two flags.
pub is_max_value_exact: bool,
62+
/// Copied from `min_is_exact()`; merging ANDs the two flags.
pub is_min_value_exact: bool,
4763
}
4864

4965
// Typed statistics are a typed variant of statistics
@@ -64,24 +80,40 @@ impl TypedStatistics {
6480
TypedStatistics::Bool(BoolType {
6581
min: min(this.min, other.min),
6682
max: max(this.max, other.max),
83+
distinct_count: this.distinct_count + other.distinct_count,
84+
null_count: this.null_count + other.null_count,
85+
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
86+
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
6787
})
6888
}
6989
(TypedStatistics::Float(this), TypedStatistics::Float(other)) => {
7090
TypedStatistics::Float(Float64Type {
7191
min: this.min.min(other.min),
7292
max: this.max.max(other.max),
93+
distinct_count: this.distinct_count + other.distinct_count,
94+
null_count: this.null_count + other.null_count,
95+
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
96+
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
7397
})
7498
}
7599
(TypedStatistics::Int(this), TypedStatistics::Int(other)) => {
76100
TypedStatistics::Int(Int64Type {
77101
min: min(this.min, other.min),
78102
max: max(this.max, other.max),
103+
distinct_count: this.distinct_count + other.distinct_count,
104+
null_count: this.null_count + other.null_count,
105+
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
106+
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
79107
})
80108
}
81109
(TypedStatistics::String(this), TypedStatistics::String(other)) => {
82110
TypedStatistics::String(Utf8Type {
83111
min: min(this.min, other.min),
84112
max: max(this.max, other.max),
113+
distinct_count: this.distinct_count + other.distinct_count,
114+
null_count: this.null_count + other.null_count,
115+
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
116+
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
85117
})
86118
}
87119
_ => panic!("Cannot update wrong types"),
@@ -146,26 +178,74 @@ impl TryFrom<&Statistics> for TypedStatistics {
146178
Statistics::Boolean(stats) => TypedStatistics::Bool(BoolType {
147179
min: *stats.min_opt().expect("Boolean stats min not set"),
148180
max: *stats.max_opt().expect("Boolean stats max not set"),
181+
distinct_count: stats
182+
.distinct_count()
183+
.expect("Boolean stats distinct count not set"),
184+
null_count: stats
185+
.null_count_opt()
186+
.expect("Boolean stats null count not set"),
187+
is_max_value_exact: stats.max_is_exact(),
188+
is_min_value_exact: stats.min_is_exact(),
149189
}),
150190
Statistics::Int32(stats) => TypedStatistics::Int(Int64Type {
151191
min: *stats.min_opt().expect("Int32 stats min not set") as i64,
152192
max: *stats.max_opt().expect("Int32 stats max not set") as i64,
193+
distinct_count: stats
194+
.distinct_count()
195+
.expect("Boolean stats distinct count not set"),
196+
null_count: stats
197+
.null_count_opt()
198+
.expect("Boolean stats null count not set"),
199+
is_max_value_exact: stats.max_is_exact(),
200+
is_min_value_exact: stats.min_is_exact(),
153201
}),
154202
Statistics::Int64(stats) => TypedStatistics::Int(Int64Type {
155203
min: *stats.min_opt().expect("Int64 stats min not set"),
156204
max: *stats.max_opt().expect("Int64 stats max not set"),
205+
distinct_count: stats
206+
.distinct_count()
207+
.expect("Boolean stats distinct count not set"),
208+
null_count: stats
209+
.null_count_opt()
210+
.expect("Boolean stats null count not set"),
211+
is_max_value_exact: stats.max_is_exact(),
212+
is_min_value_exact: stats.min_is_exact(),
157213
}),
158214
Statistics::Int96(stats) => TypedStatistics::Int(Int64Type {
159215
min: stats.min_opt().expect("Int96 stats min not set").to_i64(),
160216
max: stats.max_opt().expect("Int96 stats max not set").to_i64(),
217+
distinct_count: stats
218+
.distinct_count()
219+
.expect("Boolean stats distinct count not set"),
220+
null_count: stats
221+
.null_count_opt()
222+
.expect("Boolean stats null count not set"),
223+
is_max_value_exact: stats.max_is_exact(),
224+
is_min_value_exact: stats.min_is_exact(),
161225
}),
162226
Statistics::Float(stats) => TypedStatistics::Float(Float64Type {
163227
min: *stats.min_opt().expect("Float32 stats min not set") as f64,
164228
max: *stats.max_opt().expect("Float32 stats max not set") as f64,
229+
distinct_count: stats
230+
.distinct_count()
231+
.expect("Boolean stats distinct count not set"),
232+
null_count: stats
233+
.null_count_opt()
234+
.expect("Boolean stats null count not set"),
235+
is_max_value_exact: stats.max_is_exact(),
236+
is_min_value_exact: stats.min_is_exact(),
165237
}),
166238
Statistics::Double(stats) => TypedStatistics::Float(Float64Type {
167239
min: *stats.min_opt().expect("Float64 stats min not set"),
168240
max: *stats.max_opt().expect("Float64 stats max not set"),
241+
distinct_count: stats
242+
.distinct_count()
243+
.expect("Boolean stats distinct count not set"),
244+
null_count: stats
245+
.null_count_opt()
246+
.expect("Boolean stats null count not set"),
247+
is_max_value_exact: stats.max_is_exact(),
248+
is_min_value_exact: stats.min_is_exact(),
169249
}),
170250
Statistics::ByteArray(stats) => TypedStatistics::String(Utf8Type {
171251
min: stats
@@ -178,6 +258,14 @@ impl TryFrom<&Statistics> for TypedStatistics {
178258
.expect("Utf8 stats max not set")
179259
.as_utf8()?
180260
.to_owned(),
261+
distinct_count: stats
262+
.distinct_count()
263+
.expect("Boolean stats distinct count not set"),
264+
null_count: stats
265+
.null_count_opt()
266+
.expect("Boolean stats null count not set"),
267+
is_max_value_exact: stats.max_is_exact(),
268+
is_min_value_exact: stats.min_is_exact(),
181269
}),
182270
Statistics::FixedLenByteArray(stats) => TypedStatistics::String(Utf8Type {
183271
min: stats
@@ -190,6 +278,14 @@ impl TryFrom<&Statistics> for TypedStatistics {
190278
.expect("Utf8 stats max not set")
191279
.as_utf8()?
192280
.to_owned(),
281+
distinct_count: stats
282+
.distinct_count()
283+
.expect("Boolean stats distinct count not set"),
284+
null_count: stats
285+
.null_count_opt()
286+
.expect("Boolean stats null count not set"),
287+
is_max_value_exact: stats.max_is_exact(),
288+
is_min_value_exact: stats.min_is_exact(),
193289
}),
194290
};
195291

Diff for: src/storage/staging.rs

+1-26
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ use parquet::{
3737
arrow::ArrowWriter,
3838
basic::Encoding,
3939
errors::ParquetError,
40-
file::properties::{EnabledStatistics, WriterProperties, WriterPropertiesBuilder},
40+
file::properties::{WriterProperties, WriterPropertiesBuilder},
4141
format::SortingColumn,
4242
schema::types::ColumnPath,
4343
};
@@ -339,31 +339,6 @@ pub fn parquet_writer_props(
339339
sorting_column_vec.push(sorting_column);
340340
}
341341

342-
props = props
343-
.set_dictionary_enabled(true)
344-
.set_encoding(Encoding::PLAIN)
345-
.set_statistics_enabled(EnabledStatistics::Chunk);
346-
347-
let url_column = ColumnPath::new(vec!["URL".to_string()]);
348-
props = props
349-
.set_column_dictionary_enabled(url_column.clone(), true)
350-
.set_column_encoding(url_column.clone(), Encoding::DELTA_BYTE_ARRAY)
351-
.set_column_statistics_enabled(url_column.clone(), EnabledStatistics::Chunk);
352-
353-
let event_time_column = ColumnPath::new(vec!["EventTime".to_string()]);
354-
props = props
355-
.set_column_encoding(event_time_column.clone(), Encoding::DELTA_BINARY_PACKED)
356-
.set_column_statistics_enabled(event_time_column.clone(), EnabledStatistics::Chunk);
357-
358-
let user_id_column = ColumnPath::new(vec!["UserID".to_string()]);
359-
props = props
360-
.set_column_encoding(user_id_column.clone(), Encoding::DELTA_BINARY_PACKED)
361-
.set_column_statistics_enabled(user_id_column.clone(), EnabledStatistics::Chunk);
362-
363-
let search_phrase_column = ColumnPath::new(vec!["SearchPhrase".to_string()]);
364-
props = props
365-
.set_column_encoding(search_phrase_column.clone(), Encoding::DELTA_BYTE_ARRAY)
366-
.set_column_statistics_enabled(search_phrase_column.clone(), EnabledStatistics::Chunk);
367342
props = props.set_sorting_columns(Some(sorting_column_vec));
368343
props
369344
}

0 commit comments

Comments
 (0)