Datastore revamp 1: new indexing model & core datastructures #1727

Merged
merged 1 commit on Apr 12, 2023

Changes from all commits
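The common thread in the diffs below: `DataStore::latest_at` and `DataStore::range` now return `DataCell`s directly, where they previously returned row indices that callers had to resolve through a second `store.get(...)` lookup. A minimal sketch of the new call-site shape, assuming only the signatures visible in the diffs (this helper mirrors the benchmark code further down):

```rust
use re_arrow_store::{DataStore, LatestAtQuery};
use re_log_types::{ComponentName, DataCell, EntityPath};

/// Sketch of the post-PR call-site shape: one call returns the cells,
/// with no follow-up `store.get` to resolve row indices.
fn latest_cells<const N: usize>(
    store: &DataStore,
    query: &LatestAtQuery,
    ent_path: &EntityPath,
    primary: ComponentName,
    secondaries: &[ComponentName; N],
) -> [Option<DataCell>; N] {
    store
        .latest_at(query, ent_path, primary, secondaries)
        // No data at all for this query: hand back N empty cells.
        .unwrap_or_else(|| [(); N].map(|_| None))
}
```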
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -80,8 +80,10 @@ polars-core = "0.27.1"
 polars-lazy = "0.27.1"
 polars-ops = "0.27.1"
 puffin = "0.14"
+smallvec = { version = "1.0", features = ["const_generics", "union"] }
 thiserror = "1.0"
 time = { version = "0.3", features = ["wasm-bindgen"] }
+tinyvec = { version = "1.6", features = ["alloc", "rustc_1_55"] }
 tokio = "1.24"
 wgpu = { version = "0.15.1", default-features = false }
 wgpu-core = { version = "0.15.1", default-features = false }
8 changes: 4 additions & 4 deletions crates/re_arrow_store/Cargo.toml
@@ -38,16 +38,17 @@ re_log.workspace = true

 # External dependencies:
 ahash.workspace = true
-anyhow.workspace = true
 arrow2 = { workspace = true, features = [
     "compute_concatenate",
     "compute_aggregate",
 ] }
 arrow2_convert.workspace = true
 document-features = "0.2"
 indent = "0.1"
 itertools = { workspace = true }
 nohash-hasher = "0.2"
 parking_lot.workspace = true
+smallvec.workspace = true
+static_assertions = "1.1"
 thiserror.workspace = true

@@ -73,6 +74,7 @@ polars-ops = { workspace = true, optional = true, features = [


 [dev-dependencies]
+anyhow.workspace = true
 criterion = "0.4"
 mimalloc.workspace = true
 polars-core = { workspace = true, features = [
@@ -85,9 +87,7 @@ polars-core = { workspace = true, features = [
     "sort_multiple",
 ] }
 rand = "0.8"
-smallvec = { version = "1.0", features = ["const_generics", "union"] }
-tinyvec = { version = "1.6", features = ["alloc", "rustc_1_55"] }
-
+tinyvec.workspace = true

 [lib]
 bench = false
53 changes: 22 additions & 31 deletions crates/re_arrow_store/benches/data_store.rs
@@ -1,14 +1,15 @@
 #[global_allocator]
 static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

-use arrow2::array::{Array, UnionArray};
+use arrow2::array::UnionArray;
 use criterion::{criterion_group, criterion_main, Criterion};

 use re_arrow_store::{DataStore, DataStoreConfig, LatestAtQuery, RangeQuery, TimeInt, TimeRange};
 use re_log_types::{
     component_types::{InstanceKey, Rect2D},
     datagen::{build_frame_nr, build_some_instances, build_some_rects},
-    Component as _, ComponentName, DataRow, DataTable, EntityPath, MsgId, TimeType, Timeline,
+    Component as _, ComponentName, DataCell, DataRow, DataTable, EntityPath, MsgId, TimeType,
+    Timeline,
 };

 criterion_group!(benches, insert, latest_at, latest_at_missing, range);
@@ -73,10 +74,7 @@ fn insert(c: &mut Criterion) {
         b.iter(|| {
             insert_table(
                 DataStoreConfig {
-                    index_bucket_nb_rows: num_rows_per_bucket,
-                    component_bucket_nb_rows: num_rows_per_bucket,
-                    index_bucket_size_bytes: u64::MAX,
-                    component_bucket_size_bytes: u64::MAX,
+                    indexed_bucket_num_rows: num_rows_per_bucket,
                     ..Default::default()
                 },
                 InstanceKey::name(),
@@ -101,10 +99,11 @@ fn latest_at(c: &mut Criterion) {
     group.bench_function("default", |b| {
         let store = insert_table(Default::default(), InstanceKey::name(), &table);
         b.iter(|| {
-            let results = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
-            let rects = results[0]
+            let cells = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
+            let rects = cells[0]
                 .as_ref()
                 .unwrap()
+                .as_arrow_ref()
                 .as_any()
                 .downcast_ref::<UnionArray>()
                 .unwrap();
@@ -116,21 +115,19 @@
     for &num_rows_per_bucket in num_rows_per_bucket() {
         let store = insert_table(
             DataStoreConfig {
-                index_bucket_nb_rows: num_rows_per_bucket,
-                component_bucket_nb_rows: num_rows_per_bucket,
-                index_bucket_size_bytes: u64::MAX,
-                component_bucket_size_bytes: u64::MAX,
+                indexed_bucket_num_rows: num_rows_per_bucket,
                 ..Default::default()
             },
             InstanceKey::name(),
             &table,
         );
         group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| {
             b.iter(|| {
-                let results = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
-                let rects = results[0]
+                let cells = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
+                let rects = cells[0]
                     .as_ref()
                     .unwrap()
+                    .as_arrow_ref()
                     .as_any()
                     .downcast_ref::<UnionArray>()
                     .unwrap();
@@ -180,10 +177,7 @@ fn latest_at_missing(c: &mut Criterion) {
     for &num_rows_per_bucket in num_rows_per_bucket() {
         let store = insert_table(
             DataStoreConfig {
-                index_bucket_nb_rows: num_rows_per_bucket,
-                component_bucket_nb_rows: num_rows_per_bucket,
-                index_bucket_size_bytes: u64::MAX,
-                component_bucket_size_bytes: u64::MAX,
+                indexed_bucket_num_rows: num_rows_per_bucket,
                 ..Default::default()
             },
             InstanceKey::name(),
@@ -236,25 +230,23 @@ fn range(c: &mut Criterion) {
     for &num_rows_per_bucket in num_rows_per_bucket() {
         let store = insert_table(
             DataStoreConfig {
-                index_bucket_nb_rows: num_rows_per_bucket,
-                component_bucket_nb_rows: num_rows_per_bucket,
-                index_bucket_size_bytes: u64::MAX,
-                component_bucket_size_bytes: u64::MAX,
+                indexed_bucket_num_rows: num_rows_per_bucket,
                 ..Default::default()
             },
             InstanceKey::name(),
             &table,
         );
         group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| {
             b.iter(|| {
-                let msgs = range_data(&store, [Rect2D::name()]);
-                for (cur_time, (time, results)) in msgs.enumerate() {
+                let rows = range_data(&store, [Rect2D::name()]);
+                for (cur_time, (time, cells)) in rows.enumerate() {
                     let time = time.unwrap();
                     assert_eq!(cur_time as i64, time.as_i64());

-                    let rects = results[0]
+                    let rects = cells[0]
                         .as_ref()
                         .unwrap()
+                        .as_arrow_ref()
                         .as_any()
                         .downcast_ref::<UnionArray>()
                         .unwrap();
@@ -305,26 +297,25 @@ fn latest_data_at<const N: usize>(
     store: &DataStore,
     primary: ComponentName,
     secondaries: &[ComponentName; N],
-) -> [Option<Box<dyn Array>>; N] {
+) -> [Option<DataCell>; N] {
     let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence);
     let timeline_query = LatestAtQuery::new(timeline_frame_nr, (NUM_ROWS / 2).into());
     let ent_path = EntityPath::from("rects");

-    let row_indices = store
+    store
         .latest_at(&timeline_query, &ent_path, primary, secondaries)
-        .unwrap_or_else(|| [(); N].map(|_| None));
-    store.get(secondaries, &row_indices)
+        .unwrap_or_else(|| [(); N].map(|_| None))
 }

 fn range_data<const N: usize>(
     store: &DataStore,
     components: [ComponentName; N],
-) -> impl Iterator<Item = (Option<TimeInt>, [Option<Box<dyn Array>>; N])> + '_ {
+) -> impl Iterator<Item = (Option<TimeInt>, [Option<DataCell>; N])> + '_ {
     let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence);
     let query = RangeQuery::new(timeline_frame_nr, TimeRange::new(0.into(), NUM_ROWS.into()));
     let ent_path = EntityPath::from("rects");

     store
         .range(&query, &ent_path, components)
-        .map(move |(time, _, row_indices)| (time, store.get(&components, &row_indices)))
+        .map(move |(time, _, cells)| (time, cells))
 }
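With `range` yielding cells directly, the `store.get` indirection disappears from consumers as well. A minimal sketch of walking a range query under the same assumptions as the benchmark above (the entity path, timeline name, and the `1_000` bound are illustrative only):

```rust
use re_arrow_store::{DataStore, RangeQuery, TimeRange};
use re_log_types::{component_types::Rect2D, Component as _, EntityPath, TimeType, Timeline};

/// Sketch: consume a range query now that it yields `DataCell`s directly.
fn count_rect_instances(store: &DataStore) {
    let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence);
    let query = RangeQuery::new(timeline_frame_nr, TimeRange::new(0.into(), 1_000.into()));
    let ent_path = EntityPath::from("rects");

    for (time, _, cells) in store.range(&query, &ent_path, [Rect2D::name()]) {
        if let Some(cell) = &cells[0] {
            // `as_arrow_ref` exposes the cell's underlying arrow2 array.
            println!("{time:?}: {} rects", cell.as_arrow_ref().len());
        }
    }
}
```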
7 changes: 2 additions & 5 deletions crates/re_arrow_store/src/lib.rs
@@ -33,17 +33,14 @@ pub mod polars_util;
 pub mod test_util;

 pub use self::arrow_util::ArrayExt;
-pub use self::store::{
-    DataStore, DataStoreConfig, IndexBucket, IndexRowNr, IndexTable, RowIndex, RowIndexKind,
-};
+pub use self::store::{DataStore, DataStoreConfig};
 pub use self::store_gc::GarbageCollectionTarget;
 pub use self::store_read::{LatestAtQuery, RangeQuery};
 pub use self::store_stats::DataStoreStats;
 pub use self::store_write::{WriteError, WriteResult};

 pub(crate) use self::store::{
-    ComponentBucket, ComponentTable, IndexBucketIndices, PersistentComponentTable,
-    PersistentIndexTable, SecondaryIndex, TimeIndex,
+    IndexedBucket, IndexedBucketInner, IndexedTable, PersistentIndexedTable,
 };

 // Re-exports
36 changes: 18 additions & 18 deletions crates/re_arrow_store/src/polars_util.rs
@@ -1,8 +1,7 @@
-use arrow2::array::Array;
 use itertools::Itertools;
 use polars_core::{prelude::*, series::Series};
 use polars_ops::prelude::*;
-use re_log_types::{ComponentName, EntityPath, TimeInt};
+use re_log_types::{ComponentName, DataCell, EntityPath, TimeInt};

 use crate::{ArrayExt, DataStore, LatestAtQuery, RangeQuery};

@@ -38,12 +37,11 @@ pub fn latest_component(
     let cluster_key = store.cluster_key();

     let components = &[cluster_key, primary];
-    let row_indices = store
+    let cells = store
         .latest_at(query, ent_path, primary, components)
-        .unwrap_or([None; 2]);
-    let results = store.get(components, &row_indices);
+        .unwrap_or([(); 2].map(|_| None));

-    dataframe_from_results(components, results)
+    dataframe_from_cells(&cells)
 }

 /// Queries any number of components and their cluster keys from their respective point-of-views,
@@ -161,12 +159,11 @@ pub fn range_components<'a, const N: usize>(
         .chain(
             store
                 .range(query, ent_path, components)
-                .map(move |(time, _, row_indices)| {
-                    let results = store.get(&components, &row_indices);
+                .map(move |(time, _, cells)| {
                     (
                         time,
-                        row_indices[primary_col].is_some(), // is_primary
-                        dataframe_from_results(&components, results),
+                        cells[primary_col].is_some(), // is_primary
+                        dataframe_from_cells(&cells),
                     )
                 }),
         )
@@ -200,16 +197,19 @@

 // --- Joins ---

-pub fn dataframe_from_results<const N: usize>(
-    components: &[ComponentName; N],
-    results: [Option<Box<dyn Array>>; N],
+// TODO(#1619): none of this mess should be here
+
+pub fn dataframe_from_cells<const N: usize>(
+    cells: &[Option<DataCell>; N],
 ) -> SharedResult<DataFrame> {
-    let series: Result<Vec<_>, _> = components
+    let series: Result<Vec<_>, _> = cells
         .iter()
-        .zip(results)
-        .filter_map(|(component, col)| col.map(|col| (component, col)))
-        .map(|(&component, col)| {
-            Series::try_from((component.as_str(), col.as_ref().clean_for_polars()))
+        .flatten()
+        .map(|cell| {
+            Series::try_from((
+                cell.component_name().as_str(),
+                cell.as_arrow_ref().clean_for_polars(),
+            ))
         })
         .collect();

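One design note on `dataframe_from_cells`: the `.iter().flatten()` chain is what silently drops the `None` cells, so only components that actually matched the query become dataframe columns. A tiny self-contained illustration of that idiom (hypothetical names, not from this PR):

```rust
/// Hypothetical helper: `&Option<T>` iterates over zero or one `&T`,
/// so `flatten` skips the missing cells without any explicit filtering.
fn present<const N: usize, T>(cells: &[Option<T>; N]) -> Vec<&T> {
    cells.iter().flatten().collect()
}

fn main() {
    let cells = [Some("rect2d"), None, Some("instance_key")];
    assert_eq!(present(&cells), vec![&"rect2d", &"instance_key"]);
}
```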