Skip to content

Commit

Permalink
feat: add encoder utilities for pushdown (#2388)
Browse files Browse the repository at this point in the history
This adds a new field encoder (ZoneMapFieldEncoder) that calculates
pushdown statistics and places them in the metadata.

It also changes the encoder so that it the choice of encoding is
configurable. This makes it possible for extensions to register custom
encodings. The zone maps encoder is an example of this as it is placed
in a special crate for "encodings that rely on datafusion".

It also adds some utilities for converting an `EncodedBatch` to `Bytes`
according to the lance file format. This makes it possible to go from
`RecordBatch` to `Bytes` using the lance file format.

There is not much testing for the zone maps encoder. More will come when
we add support for reading zone maps but I want to keep this PR simple
for now.
  • Loading branch information
westonpace authored May 28, 2024
1 parent 35c066a commit d8da445
Show file tree
Hide file tree
Showing 19 changed files with 791 additions and 160 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ members = [
"rust/lance-core",
"rust/lance-datagen",
"rust/lance-encoding",
"rust/lance-encoding-datafusion",
"rust/lance-file",
"rust/lance-index",
"rust/lance-io",
Expand Down Expand Up @@ -48,6 +49,7 @@ lance-core = { version = "=0.11.1", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.11.1", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.11.1", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.11.1", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.11.1", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.11.1", path = "./rust/lance-file" }
lance-index = { version = "=0.11.1", path = "./rust/lance-index" }
lance-io = { version = "=0.11.1", path = "./rust/lance-io" }
Expand Down
31 changes: 31 additions & 0 deletions protos/encodings-df.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors


syntax = "proto3";

package lance.encodings_datafusion;

import "encodings.proto";

// ZoneMaps are a way to wrap any leaf array with a set of zone maps that
// can be used to apply pushdown filtering.
//
// A "zone map" is the min/max/null_count of a set of rows. This can be
// used to quickly filter out zones which are not included in simple predicates
// like 'x = 5' or 'x > 10'.
message ZoneMaps {

// How many rows are covered by each zone map. There will be
// ceil_div(num_rows, rows_per_map) zone maps.
uint32 rows_per_map = 1;

// The zone maps are encoded as struct arrays with 1 row per zone. This
// should be stored in a column metadata buffer. The struct array should
// have 3 children: min: T, max: T, null_count: u32
lance.encodings.ArrayEncoding stats = 2;

// The underlying array values
lance.encodings.ArrayEncoding values = 5;
}

1 change: 1 addition & 0 deletions python/src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ impl LanceFileWriter {
FileWriterOptions {
data_cache_bytes,
keep_original_array,
..Default::default()
},
)
.infer_error()?;
Expand Down
37 changes: 37 additions & 0 deletions rust/lance-encoding-datafusion/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
[package]
name = "lance-encoding-datafusion"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
readme = "README.md"
description = "Encoders and decoders for the Lance file format that rely on datafusion"
keywords.workspace = true
categories.workspace = true
rust-version.workspace = true

[dependencies]
lance-core = { workspace = true, features = ["datafusion"] }
lance-encoding.workspace = true
lance-file.workspace = true
arrow-array.workspace = true
arrow-buffer.workspace = true
arrow-schema.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datafusion-physical-expr.workspace = true
futures.workspace = true
prost.workspace = true
prost-types.workspace = true

[dev-dependencies]
rand.workspace = true
tokio.workspace = true
lance-datagen.workspace = true

[build-dependencies]
prost-build.workspace = true

[target.'cfg(target_os = "linux")'.dev-dependencies]
pprof = { workspace = true }
8 changes: 8 additions & 0 deletions rust/lance-encoding-datafusion/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# lance-encoding-datafusion

`lance-encoding-datafusion` is an internal sub-crate, containing encoders and
decoders for the Lance file format that rely on Datafusion. Partly this is to
keep the size of `lance-encoding` small and partly this is to prove that
encodings are extensible.

**Important Note**: This crate is **not intended for external usage**.
16 changes: 16 additions & 0 deletions rust/lance-encoding-datafusion/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::io::Result;

fn main() -> Result<()> {
println!("cargo:rerun-if-changed=protos");

let mut prost_build = prost_build::Config::new();
prost_build.extern_path(".lance.encodings", "::lance_encoding::format::pb");
prost_build.protoc_arg("--experimental_allow_proto3_optional");
prost_build.enable_type_names();
prost_build.compile_protos(&["./protos/encodings-df.proto"], &["./protos"])?;

Ok(())
}
1 change: 1 addition & 0 deletions rust/lance-encoding-datafusion/protos
15 changes: 15 additions & 0 deletions rust/lance-encoding-datafusion/src/format.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

/// Protobuf definitions for encodings
pub mod pb {
#![allow(clippy::all)]
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(unused)]
#![allow(improper_ctypes)]
#![allow(clippy::upper_case_acronyms)]
#![allow(clippy::use_self)]
include!(concat!(env!("OUT_DIR"), "/lance.encodings_datafusion.rs"));
}
63 changes: 63 additions & 0 deletions rust/lance-encoding-datafusion/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use arrow_schema::DataType;
use lance_encoding::encoder::{
ColumnIndexSequence, CoreFieldEncodingStrategy, FieldEncodingStrategy,
};
use zone::ZoneMapsFieldEncoder;

pub mod format;
pub mod zone;

/// Wraps the core encoding strategy and adds the encoders from this
/// crate
#[derive(Debug)]
pub struct LanceDfFieldEncodingStrategy {
core: CoreFieldEncodingStrategy,
rows_per_map: u32,
}

impl FieldEncodingStrategy for LanceDfFieldEncodingStrategy {
fn create_field_encoder(
&self,
encoding_strategy_root: &dyn FieldEncodingStrategy,
field: &lance_core::datatypes::Field,
column_index: &mut ColumnIndexSequence,
cache_bytes_per_column: u64,
keep_original_array: bool,
config: &std::collections::HashMap<String, String>,
) -> lance_core::Result<Box<dyn lance_encoding::encoder::FieldEncoder>> {
let data_type = field.data_type();
if data_type.is_primitive()
|| matches!(
data_type,
DataType::Boolean | DataType::Utf8 | DataType::LargeUtf8
)
{
let inner_encoder = self.core.create_field_encoder(
// Don't collect stats on inner string fields
&self.core,
field,
column_index,
cache_bytes_per_column,
keep_original_array,
config,
)?;
Ok(Box::new(ZoneMapsFieldEncoder::try_new(
inner_encoder,
data_type.clone(),
self.rows_per_map,
)?))
} else {
self.core.create_field_encoder(
encoding_strategy_root,
field,
column_index,
cache_bytes_per_column,
keep_original_array,
config,
)
}
}
}
Loading

0 comments on commit d8da445

Please sign in to comment.