Skip to content

Commit

Permalink
Remove utils functions no longer needed
Browse files Browse the repository at this point in the history
  • Loading branch information
westonpace committed Jul 26, 2024
1 parent 5568fed commit d00f9bd
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 6 deletions.
21 changes: 15 additions & 6 deletions rust/lance-encoding/src/buffer.rs
Original file line number Diff line number Diff line change
@@ -1,37 +1,46 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Utilities for byte arrays

use std::{ops::Deref, ptr::NonNull, sync::Arc};

use arrow_buffer::Buffer;

/// A copy-on-write byte buffer
///
/// This is a copy-on-write version of Buffer / MutableBuffer or Bytes / BytesMut.
///
/// It can be created from read-only buffers (e.g. bytes::Bytes or arrow_buffer::Buffer), i.e. "borrowed",
/// or from writeable buffers (e.g. Vec<u8>), i.e. "owned".
#[derive(Debug)]
pub enum LanceBuffer {
/// Read-only data held in an Arrow `Buffer`; `into_owned` copies it into a `Vec<u8>`
Borrowed(Buffer),
/// Writeable data held in a `Vec<u8>`; `into_owned` returns it without copying
Owned(Vec<u8>),
}

impl LanceBuffer {
/// Consume this buffer and return its bytes as a `Vec<u8>`.
///
/// An owned buffer hands back its vector as-is; a borrowed buffer has its
/// contents copied into a new vector.
pub fn into_owned(self) -> Vec<u8> {
    match self {
        LanceBuffer::Owned(bytes) => bytes,
        LanceBuffer::Borrowed(shared) => shared.to_vec(),
    }
}

/// Consume this buffer and return it as an Arrow `Buffer`.
///
/// A borrowed buffer is returned unchanged and an owned vector is handed to
/// `Buffer::from_vec`, so the data is never copied.
pub fn into_buffer(self) -> Buffer {
    match self {
        LanceBuffer::Owned(bytes) => Buffer::from_vec(bytes),
        LanceBuffer::Borrowed(shared) => shared,
    }
}

/// Create a LanceBuffer from a bytes::Bytes object
///
/// The alignment must be specified (as `bytes_per_value`) since we want to make
/// sure we can safely reinterpret the buffer.
///
/// If the buffer is properly aligned this will be zero-copy. If not, a copy
/// will be made and an owned buffer returned.
pub fn from_bytes(bytes: bytes::Bytes, bytes_per_value: u64) -> LanceBuffer {
if bytes.as_ptr().align_offset(bytes_per_value as usize) != 0 {
// The original buffer is not aligned, cannot zero-copy
Expand Down
34 changes: 34 additions & 0 deletions rust/lance-encoding/src/data.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Data layouts to represent encoded data in a sub-Arrow format

use std::any::Any;

use arrow::array::{ArrayData, ArrayDataBuilder};
Expand All @@ -11,13 +13,29 @@ use lance_core::{Error, Result};

use crate::buffer::LanceBuffer;

/// A DataBlock is a collection of buffers that represents an "array" of data in very generic terms
///
/// The output of each decoder is a DataBlock. Decoders can be chained together to transform
/// one DataBlock into a different kind of DataBlock.
///
/// The DataBlock is somewhere in between Arrow's ArrayData and Array and represents a physical
/// layout of the data.
///
/// A DataBlock can be converted into an Arrow ArrayData (and then Array) for a given array type.
/// For example, a FixedWidthDataBlock can be converted into any primitive type or a fixed size
/// list of a primitive type.
pub trait DataBlock: Any {
/// Get a reference to the Any trait object
///
/// Borrows the block as `&dyn Any`, which allows downcasting to a concrete block type.
fn as_any(&self) -> &dyn Any;
/// Convert self into a `Box<dyn Any>`
///
/// Consumes the boxed block, which allows an owned downcast to a concrete block type.
fn as_any_box(self: Box<Self>) -> Box<dyn Any>;
/// Convert self into an Arrow ArrayData
///
/// Consumes the boxed block. `data_type` is the Arrow type requested for the result.
///
/// NOTE(review): `validate` presumably controls whether the resulting ArrayData is
/// validated when it is built -- confirm against the implementations.
///
/// Returns a `Result`; the failure conditions are defined by each implementation.
fn into_arrow(self: Box<Self>, data_type: DataType, validate: bool) -> Result<ArrayData>;
}

/// Extension trait for DataBlock
///
/// Adds a typed conversion for callers that need a concrete block type.
pub trait DataBlockExt {
/// Try to convert a DataBlock into a specific layout
///
/// `T` is the concrete block type being requested. Consumes `self` and, on success,
/// returns the block as a `Box<T>`.
///
/// NOTE(review): presumably an error is returned when the block is not a `T` --
/// confirm against the implementation.
fn try_into_layout<T: DataBlock>(self) -> Result<Box<T>>;
}

Expand All @@ -32,7 +50,9 @@ impl DataBlockExt for Box<dyn DataBlock> {
}
}

/// A data block with no buffers where everything is null
///
/// Only the number of values is stored.
pub struct AllNullDataBlock {
/// The number of values represented by this block
pub num_values: u64,
}

Expand All @@ -50,8 +70,11 @@ impl DataBlock for AllNullDataBlock {
}
}

/// Wraps a data block and adds nullability information to it
///
/// The wrapped block can be any `DataBlock` implementation, since it is stored as a
/// boxed trait object.
pub struct NullableDataBlock {
/// The underlying data
pub data: Box<dyn DataBlock>,
/// A bitmap of validity for each value
///
/// NOTE(review): presumably follows the Arrow convention (set bit = valid) -- confirm.
pub nulls: LanceBuffer,
}

Expand All @@ -76,9 +99,13 @@ impl DataBlock for NullableDataBlock {
}
}

/// A data block for a single buffer of data where each element has a fixed number of bits
///
/// The width is expressed in bits (not bytes), see `bits_per_value`.
pub struct FixedWidthDataBlock {
/// The data buffer
pub data: LanceBuffer,
/// The number of bits per value
pub bits_per_value: u64,
/// The number of values represented by this block
pub num_values: u64,
}

Expand Down Expand Up @@ -130,10 +157,15 @@ impl DataBlock for FixedWidthDataBlock {
}
}

/// A data block for variable-width data (e.g. strings, packed rows, etc.)
///
/// The values live in `data` and are delimited by the entries in `offsets`.
pub struct VariableWidthBlock {
/// The data buffer
pub data: LanceBuffer,
/// The offsets buffer (contains num_values + 1 offsets)
pub offsets: LanceBuffer,
/// The number of bits per offset
///
/// NOTE(review): presumably 32 or 64 to match Arrow offset widths -- confirm.
pub bits_per_offset: u32,
/// The number of values represented by this block
pub num_values: u64,
}

Expand Down Expand Up @@ -162,7 +194,9 @@ impl DataBlock for VariableWidthBlock {
}
}

/// A data block representing a struct
///
/// The struct is stored as a list of child data blocks.
pub struct StructDataBlock {
/// The child arrays, each stored as a boxed `DataBlock`
///
/// NOTE(review): presumably one entry per struct field, in field order -- confirm.
pub children: Vec<Box<dyn DataBlock>>,
}

Expand Down

0 comments on commit d00f9bd

Please sign in to comment.