-
Notifications
You must be signed in to change notification settings - Fork 222
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add encoder utilities for pushdown (#2388)
This adds a new field encoder (ZoneMapsFieldEncoder) that calculates pushdown statistics and places them in the metadata. It also changes the encoder so that the choice of encoding is configurable. This makes it possible for extensions to register custom encodings. The zone maps encoder is an example of this as it is placed in a special crate for "encodings that rely on datafusion". It also adds some utilities for converting an `EncodedBatch` to `Bytes` according to the lance file format. This makes it possible to go from `RecordBatch` to `Bytes` using the lance file format. There is not much testing for the zone maps encoder. More will come when we add support for reading zone maps but I want to keep this PR simple for now.
- Loading branch information
1 parent
35c066a
commit d8da445
Showing
19 changed files
with
791 additions
and
160 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
|
||
syntax = "proto3"; | ||
|
||
package lance.encodings_datafusion; | ||
|
||
import "encodings.proto"; | ||
|
||
// ZoneMaps are a way to wrap any leaf array with a set of zone maps that
// can be used to apply pushdown filtering.
//
// A "zone map" is the min/max/null_count of a set of rows.  This can be
// used to quickly filter out zones which are not included in simple predicates
// like 'x = 5' or 'x > 10'.
//
// NOTE(review): field numbers jump from 2 to 5; 3 and 4 are not declared as
// `reserved` here -- confirm whether the gap is intentional.
message ZoneMaps {

  // How many rows are covered by each zone map.  There will be
  // ceil_div(num_rows, rows_per_map) zone maps, so the final zone may cover
  // fewer than rows_per_map rows.
  uint32 rows_per_map = 1;

  // The zone maps are encoded as struct arrays with 1 row per zone.  This
  // should be stored in a column metadata buffer.  The struct array should
  // have 3 children: min: T, max: T, null_count: u32
  // (T presumably being the type of the wrapped values -- confirm).
  lance.encodings.ArrayEncoding stats = 2;

  // The underlying array values, i.e. the encoding of the leaf array that
  // the zone maps wrap.
  lance.encodings.ArrayEncoding values = 5;
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
[package] | ||
name = "lance-encoding-datafusion" | ||
version.workspace = true | ||
edition.workspace = true | ||
authors.workspace = true | ||
license.workspace = true | ||
repository.workspace = true | ||
readme = "README.md" | ||
description = "Encoders and decoders for the Lance file format that rely on datafusion" | ||
keywords.workspace = true | ||
categories.workspace = true | ||
rust-version.workspace = true | ||
|
||
[dependencies] | ||
lance-core = { workspace = true, features = ["datafusion"] } | ||
lance-encoding.workspace = true | ||
lance-file.workspace = true | ||
arrow-array.workspace = true | ||
arrow-buffer.workspace = true | ||
arrow-schema.workspace = true | ||
datafusion-common.workspace = true | ||
datafusion-expr.workspace = true | ||
datafusion-physical-expr.workspace = true | ||
futures.workspace = true | ||
prost.workspace = true | ||
prost-types.workspace = true | ||
|
||
[dev-dependencies] | ||
rand.workspace = true | ||
tokio.workspace = true | ||
lance-datagen.workspace = true | ||
|
||
[build-dependencies] | ||
prost-build.workspace = true | ||
|
||
[target.'cfg(target_os = "linux")'.dev-dependencies] | ||
pprof = { workspace = true } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# lance-encoding-datafusion | ||
|
||
`lance-encoding-datafusion` is an internal sub-crate, containing encoders and | ||
decoders for the Lance file format that rely on DataFusion. Partly this is to | ||
keep the size of `lance-encoding` small and partly this is to prove that | ||
encodings are extensible. | ||
|
||
**Important Note**: This crate is **not intended for external usage**. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
use std::io::Result; | ||
|
||
fn main() -> Result<()> { | ||
println!("cargo:rerun-if-changed=protos"); | ||
|
||
let mut prost_build = prost_build::Config::new(); | ||
prost_build.extern_path(".lance.encodings", "::lance_encoding::format::pb"); | ||
prost_build.protoc_arg("--experimental_allow_proto3_optional"); | ||
prost_build.enable_type_names(); | ||
prost_build.compile_protos(&["./protos/encodings-df.proto"], &["./protos"])?; | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../protos |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
/// Protobuf definitions for encodings
///
/// The module body is not hand-written: it is pulled in with `include!` from
/// a file located in the build's `OUT_DIR`.
pub mod pb {
    // The lints below are silenced for the whole module because the included
    // code is generated, so style and unused-item warnings cannot be fixed at
    // the source.
    #![allow(clippy::all)]
    #![allow(non_upper_case_globals)]
    #![allow(non_camel_case_types)]
    #![allow(non_snake_case)]
    #![allow(unused)]
    #![allow(improper_ctypes)]
    #![allow(clippy::upper_case_acronyms)]
    #![allow(clippy::use_self)]
    // The file name mirrors the proto package `lance.encodings_datafusion`.
    include!(concat!(env!("OUT_DIR"), "/lance.encodings_datafusion.rs"));
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
use arrow_schema::DataType; | ||
use lance_encoding::encoder::{ | ||
ColumnIndexSequence, CoreFieldEncodingStrategy, FieldEncodingStrategy, | ||
}; | ||
use zone::ZoneMapsFieldEncoder; | ||
|
||
pub mod format; | ||
pub mod zone; | ||
|
||
/// Wraps the core encoding strategy and adds the encoders from this
/// crate
///
/// Primitive, boolean and string fields get their core encoder wrapped in a
/// [`ZoneMapsFieldEncoder`]; every other field is handled by the core
/// strategy unchanged.
#[derive(Debug)]
pub struct LanceDfFieldEncodingStrategy {
    /// The core strategy that creates the underlying field encoders.
    core: CoreFieldEncodingStrategy,
    /// Passed to each [`ZoneMapsFieldEncoder`]; presumably the number of
    /// rows covered by a single zone map (confirm against the encoder).
    rows_per_map: u32,
}
|
||
impl FieldEncodingStrategy for LanceDfFieldEncodingStrategy { | ||
fn create_field_encoder( | ||
&self, | ||
encoding_strategy_root: &dyn FieldEncodingStrategy, | ||
field: &lance_core::datatypes::Field, | ||
column_index: &mut ColumnIndexSequence, | ||
cache_bytes_per_column: u64, | ||
keep_original_array: bool, | ||
config: &std::collections::HashMap<String, String>, | ||
) -> lance_core::Result<Box<dyn lance_encoding::encoder::FieldEncoder>> { | ||
let data_type = field.data_type(); | ||
if data_type.is_primitive() | ||
|| matches!( | ||
data_type, | ||
DataType::Boolean | DataType::Utf8 | DataType::LargeUtf8 | ||
) | ||
{ | ||
let inner_encoder = self.core.create_field_encoder( | ||
// Don't collect stats on inner string fields | ||
&self.core, | ||
field, | ||
column_index, | ||
cache_bytes_per_column, | ||
keep_original_array, | ||
config, | ||
)?; | ||
Ok(Box::new(ZoneMapsFieldEncoder::try_new( | ||
inner_encoder, | ||
data_type.clone(), | ||
self.rows_per_map, | ||
)?)) | ||
} else { | ||
self.core.create_field_encoder( | ||
encoding_strategy_root, | ||
field, | ||
column_index, | ||
cache_bytes_per_column, | ||
keep_original_array, | ||
config, | ||
) | ||
} | ||
} | ||
} |
Oops, something went wrong.