Skip to content

Commit 32b385b

Browse files
authored
[Variant] VariantArrayBuilder uses MetadataBuilder and ValueBuilder (#8206)
# Which issue does this PR close? - Closes #8205 # Rationale for this change `VariantArrayBuilder` had a very complex choreography with the `VariantBuilder` API, that required lots of manual drop glue to deal with ownership transfers between it and the `VariantArrayVariantBuilder` it delegates the actual work to. Rework the whole thing to use a (now-reusable) `MetadataBuilder` and `ValueBuilder`, with rollbacks largely handled by `ParentState` -- just like the other builders in the parquet-variant crate. # What changes are included in this PR? Five changes (curated as five commits that reviewers may want to examine individually): 1. Make a bunch of parquet-variant builder infrastructure public, so that `VariantArrayBuilder` can access it from the parquet-variant-compute crate. 2. Make `MetadataBuilder` reusable. Its `finish` method appends the bytes of a new serialized metadata dictionary to the underlying buffer and resets the remaining builder state. The builder is thus ready to create a brand new metadata dictionary whose serialized bytes will also be appended to the underlying buffer once finished. 3. Rework `VariantArrayBuilder` to use `MetadataBuilder` and `ValueBuilder`, coordinated via `ParentState`. This is the main feature of the PR and also the most complicated/subtle. 4. Delete now-unused code that had been added previously in order to support the old implementation of `VariantArrayBuilder`. 5. Add missing doc comments for now-public types and methods # Are these changes tested? Existing variant array builder tests cover the change. # Are there any user-facing changes? A lot of builder-related types and methods from the parquet-variant crate are now public.
1 parent 0c4e58f commit 32b385b

File tree

2 files changed

+111
-297
lines changed

2 files changed

+111
-297
lines changed

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 51 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
use crate::VariantArray;
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
2222
use arrow_schema::{ArrowError, DataType, Field, Fields};
23-
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt};
23+
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilderExt};
24+
use parquet_variant::{MetadataBuilder, ParentState, ValueBuilder};
2425
use std::sync::Arc;
2526

2627
/// A builder for [`VariantArray`]
@@ -72,12 +73,12 @@ use std::sync::Arc;
7273
pub struct VariantArrayBuilder {
7374
/// Nulls
7475
nulls: NullBufferBuilder,
75-
/// buffer for all the metadata
76-
metadata_buffer: Vec<u8>,
76+
/// builder for all the metadata
77+
metadata_builder: MetadataBuilder,
7778
/// ending offset for each serialized metadata dictionary in the buffer
7879
metadata_offsets: Vec<usize>,
79-
/// buffer for values
80-
value_buffer: Vec<u8>,
80+
/// builder for values
81+
value_builder: ValueBuilder,
8182
/// ending offset for each serialized variant value in the buffer
8283
value_offsets: Vec<usize>,
8384
/// The fields of the final `StructArray`
@@ -95,9 +96,9 @@ impl VariantArrayBuilder {
9596

9697
Self {
9798
nulls: NullBufferBuilder::new(row_capacity),
98-
metadata_buffer: Vec::new(), // todo allocation capacity
99+
metadata_builder: MetadataBuilder::default(),
99100
metadata_offsets: Vec::with_capacity(row_capacity),
100-
value_buffer: Vec::new(),
101+
value_builder: ValueBuilder::new(),
101102
value_offsets: Vec::with_capacity(row_capacity),
102103
fields: Fields::from(vec![metadata_field, value_field]),
103104
}
@@ -107,15 +108,17 @@ impl VariantArrayBuilder {
107108
pub fn build(self) -> VariantArray {
108109
let Self {
109110
mut nulls,
110-
metadata_buffer,
111+
metadata_builder,
111112
metadata_offsets,
112-
value_buffer,
113+
value_builder,
113114
value_offsets,
114115
fields,
115116
} = self;
116117

118+
let metadata_buffer = metadata_builder.into_inner();
117119
let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_offsets);
118120

121+
let value_buffer = value_builder.into_inner();
119122
let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
120123

121124
// The build the final struct array
@@ -136,14 +139,14 @@ impl VariantArrayBuilder {
136139
pub fn append_null(&mut self) {
137140
self.nulls.append_null();
138141
// The subfields are expected to be non-nullable according to the parquet variant spec.
139-
self.metadata_offsets.push(self.metadata_buffer.len());
140-
self.value_offsets.push(self.value_buffer.len());
142+
self.metadata_offsets.push(self.metadata_builder.offset());
143+
self.value_offsets.push(self.value_builder.offset());
141144
}
142145

143146
/// Append the [`Variant`] to the builder as the next row
144147
pub fn append_variant(&mut self, variant: Variant) {
145148
let mut direct_builder = self.variant_builder();
146-
direct_builder.variant_builder.append_value(variant);
149+
direct_builder.append_value(variant);
147150
direct_builder.finish()
148151
}
149152

@@ -194,32 +197,23 @@ impl VariantArrayBuilder {
194197
///
195198
/// See [`VariantArrayBuilder::variant_builder`] for an example
196199
pub struct VariantArrayVariantBuilder<'a> {
197-
/// was finish called?
198-
finished: bool,
199-
/// starting offset in the variant_builder's `metadata` buffer
200-
metadata_offset: usize,
201-
/// starting offset in the variant_builder's `value` buffer
202-
value_offset: usize,
203-
/// Parent array builder that this variant builder writes to. Buffers
204-
/// have been moved into the variant builder, and must be returned on
205-
/// drop
206-
array_builder: &'a mut VariantArrayBuilder,
207-
/// Builder for the in progress variant value, temporarily owns the buffers
208-
/// from `array_builder`
209-
variant_builder: VariantBuilder,
200+
parent_state: ParentState<'a>,
201+
metadata_offsets: &'a mut Vec<usize>,
202+
value_offsets: &'a mut Vec<usize>,
203+
nulls: &'a mut NullBufferBuilder,
210204
}
211205

212206
impl VariantBuilderExt for VariantArrayVariantBuilder<'_> {
213207
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
214-
self.variant_builder.append_value(value);
208+
ValueBuilder::append_variant(self.parent_state(), value.into());
215209
}
216210

217211
fn try_new_list(&mut self) -> Result<ListBuilder<'_>, ArrowError> {
218-
Ok(self.variant_builder.new_list())
212+
Ok(ListBuilder::new(self.parent_state(), false))
219213
}
220214

221215
fn try_new_object(&mut self) -> Result<ObjectBuilder<'_>, ArrowError> {
222-
Ok(self.variant_builder.new_object())
216+
Ok(ObjectBuilder::new(self.parent_state(), false))
223217
}
224218
}
225219

@@ -228,103 +222,40 @@ impl<'a> VariantArrayVariantBuilder<'a> {
228222
///
229223
/// Note this is not public as this is a structure that is logically
230224
/// part of the [`VariantArrayBuilder`] and relies on its internal structure
231-
fn new(array_builder: &'a mut VariantArrayBuilder) -> Self {
232-
// append directly into the metadata and value buffers
233-
let metadata_buffer = std::mem::take(&mut array_builder.metadata_buffer);
234-
let value_buffer = std::mem::take(&mut array_builder.value_buffer);
235-
let metadata_offset = metadata_buffer.len();
236-
let value_offset = value_buffer.len();
225+
fn new(builder: &'a mut VariantArrayBuilder) -> Self {
226+
let parent_state =
227+
ParentState::variant(&mut builder.value_builder, &mut builder.metadata_builder);
237228
VariantArrayVariantBuilder {
238-
finished: false,
239-
metadata_offset,
240-
value_offset,
241-
variant_builder: VariantBuilder::new_with_buffers(metadata_buffer, value_buffer),
242-
array_builder,
229+
parent_state,
230+
metadata_offsets: &mut builder.metadata_offsets,
231+
value_offsets: &mut builder.value_offsets,
232+
nulls: &mut builder.nulls,
243233
}
244234
}
245235

246-
/// Return a reference to the underlying `VariantBuilder`
247-
pub fn inner(&self) -> &VariantBuilder {
248-
&self.variant_builder
249-
}
250-
251-
/// Return a mutable reference to the underlying `VariantBuilder`
252-
pub fn inner_mut(&mut self) -> &mut VariantBuilder {
253-
&mut self.variant_builder
254-
}
255-
256236
/// Called to finish the in progress variant and write it to the underlying
257237
/// buffers
258238
///
259239
/// Note if you do not call finish, on drop any changes made to the
260240
/// underlying buffers will be rolled back.
261241
pub fn finish(mut self) {
262-
self.finished = true;
263-
264-
let metadata_offset = self.metadata_offset;
265-
let value_offset = self.value_offset;
266-
// get the buffers back from the variant builder
267-
let (metadata_buffer, value_buffer) = std::mem::take(&mut self.variant_builder).finish();
268-
269-
// Sanity Check: if the buffers got smaller, something went wrong (previous data was lost)
270-
assert!(
271-
metadata_offset <= metadata_buffer.len(),
272-
"metadata length decreased unexpectedly"
273-
);
274-
assert!(
275-
value_offset <= value_buffer.len(),
276-
"value length decreased unexpectedly"
277-
);
278-
279-
// commit the changes by putting the
280-
// ending offsets into the parent array builder.
281-
let builder = &mut self.array_builder;
282-
builder.metadata_offsets.push(metadata_buffer.len());
283-
builder.value_offsets.push(value_buffer.len());
284-
builder.nulls.append_non_null();
242+
// Record the ending offsets after finishing metadata and finish the parent state.
243+
let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders();
244+
self.metadata_offsets.push(metadata_builder.finish());
245+
self.value_offsets.push(value_builder.offset());
246+
self.nulls.append_non_null();
247+
self.parent_state.finish();
248+
}
285249

286-
// put the buffers back into the array builder
287-
builder.metadata_buffer = metadata_buffer;
288-
builder.value_buffer = value_buffer;
250+
fn parent_state(&mut self) -> ParentState<'_> {
251+
let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders();
252+
ParentState::variant(value_builder, metadata_builder)
289253
}
290254
}
291255

256+
// Empty Drop to help with borrow checking - warns users if they forget to call finish()
292257
impl Drop for VariantArrayVariantBuilder<'_> {
293-
/// If the builder was not finished, roll back any changes made to the
294-
/// underlying buffers (by truncating them)
295-
fn drop(&mut self) {
296-
if self.finished {
297-
return;
298-
}
299-
300-
// if the object was not finished, need to rollback any changes by
301-
// truncating the buffers to the original offsets
302-
let metadata_offset = self.metadata_offset;
303-
let value_offset = self.value_offset;
304-
305-
// get the buffers back from the variant builder
306-
let (mut metadata_buffer, mut value_buffer) =
307-
std::mem::take(&mut self.variant_builder).into_buffers();
308-
309-
// Sanity Check: if the buffers got smaller, something went wrong (previous data was lost) so panic immediately
310-
metadata_buffer
311-
.len()
312-
.checked_sub(metadata_offset)
313-
.expect("metadata length decreased unexpectedly");
314-
value_buffer
315-
.len()
316-
.checked_sub(value_offset)
317-
.expect("value length decreased unexpectedly");
318-
319-
// Note this truncate is fast because truncate doesn't free any memory:
320-
// it just has to drop elements (and u8 doesn't have a destructor)
321-
metadata_buffer.truncate(metadata_offset);
322-
value_buffer.truncate(value_offset);
323-
324-
// put the buffers back into the array builder
325-
self.array_builder.metadata_buffer = metadata_buffer;
326-
self.array_builder.value_buffer = value_buffer;
327-
}
258+
fn drop(&mut self) {}
328259
}
329260

330261
fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
@@ -457,12 +388,18 @@ mod test {
457388
assert_eq!(variant_array.len(), 2);
458389
assert!(!variant_array.is_null(0));
459390
let variant = variant_array.value(0);
460-
let variant = variant.as_object().expect("variant to be an object");
461-
assert_eq!(variant.get("foo").unwrap(), Variant::from(1i32));
391+
assert_eq!(
392+
variant.get_object_field("foo"),
393+
Some(Variant::from(1i32)),
394+
"Expected an object with field \"foo\", got: {variant:?}"
395+
);
462396

463397
assert!(!variant_array.is_null(1));
464398
let variant = variant_array.value(1);
465-
let variant = variant.as_object().expect("variant to be an object");
466-
assert_eq!(variant.get("baz").unwrap(), Variant::from(3i32));
399+
assert_eq!(
400+
variant.get_object_field("baz"),
401+
Some(Variant::from(3i32)),
402+
"Expected an object with field \"baz\", got: {variant:?}"
403+
);
467404
}
468405
}

0 commit comments

Comments
 (0)