Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge FastFieldCodecReader with FastFieldDataAccess #1485

Merged
merged 4 commits into from
Aug 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions fastfield_codecs/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ mod tests {
/// Positions read by the benchmark loop: every integer in `0..20_000`.
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
fn bench_get<
S: FastFieldCodecSerializer,
R: FastFieldCodecDeserializer + FastFieldDataAccess,
>(
b: &mut Bencher,
data: &[u64],
) {
Expand All @@ -35,7 +38,7 @@ mod tests {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = reader.get_u64(pos as u64);
let val = reader.get_val(pos as u64);
debug_assert_eq!(data[pos as usize], val);
sum = sum.wrapping_add(val);
}
Expand Down
23 changes: 18 additions & 5 deletions fastfield_codecs/src/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};

/// Depending on the field type, a different
Expand All @@ -16,27 +16,32 @@ pub struct BitpackedReader {
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
pub num_vals: u64,
}

impl FastFieldCodecReader for BitpackedReader {
impl FastFieldCodecDeserializer for BitpackedReader {
Copy link
Collaborator

@fulmicoton fulmicoton Aug 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd put that in the XXXXSerializers and rename the trait XXXXCodec, and add an associated trait over the reader.

The XXXXSerializer and the XXXXDeserializer are not really independent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I don't fully understand. I split the Deserializer from the FastFieldDataAccess because, e.g., in the merge case we provide access without deserializing.

/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 16;
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
num_vals,
})
}
}
impl FastFieldDataAccess for BitpackedReader {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
}
#[inline]
Expand All @@ -47,11 +52,16 @@ impl FastFieldCodecReader for BitpackedReader {
fn max_value(&self) -> u64 {
self.max_value_u64
}
/// Number of values, as stored in the `num_vals` field read from the footer.
#[inline]
fn num_vals(&self) -> u64 {
self.num_vals
}
}
/// Writes values bit-packed relative to `min_value`, then appends a footer on close.
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
// Packs each `val - min_value` using `num_bits` bits.
bit_packer: BitPacker,
// Destination for both the packed values and the footer.
write: &'a mut W,
// Subtracted from every value before it is packed.
min_value: u64,
// Count of values written so far; serialized in the footer on close.
num_vals: u64,
// Serialized in the footer; the reader rebuilds the max value as `min_value + amplitude`.
amplitude: u64,
// Bit width used for each packed value.
num_bits: u8,
}
Expand All @@ -78,6 +88,7 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
bit_packer,
write,
min_value,
num_vals: 0,
amplitude,
num_bits,
})
Expand All @@ -88,12 +99,14 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
self.num_vals += 1;
Ok(())
}
/// Closes the bit packer and appends the footer to the underlying writer.
///
/// The footer consists of three `u64`s, written in this order:
/// `min_value`, `amplitude`, `num_vals`.
///
/// # Errors
///
/// Returns any I/O error raised while closing the bit packer or writing the footer.
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?;
self.num_vals.serialize(&mut self.write)?;
Ok(())
}
}
Expand Down
12 changes: 9 additions & 3 deletions fastfield_codecs/src/blockwise_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::linear::{get_calculated_value, get_slope};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};

const CHUNK_SIZE: u64 = 512;
Expand Down Expand Up @@ -148,7 +148,7 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)]
}

impl FastFieldCodecReader for BlockwiseLinearReader {
impl FastFieldCodecDeserializer for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
Expand All @@ -157,9 +157,11 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
}

impl FastFieldDataAccess for BlockwiseLinearReader {
#[inline]
fn get_u64(&self, idx: u64) -> u64 {
fn get_val(&self, idx: u64) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let in_block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value(
Expand All @@ -182,6 +184,10 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
/// Number of values, taken from the deserialized footer.
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}

/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
Expand Down
55 changes: 23 additions & 32 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,21 @@ pub mod bitpacked;
pub mod blockwise_linear;
pub mod linear;

pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64) -> u64;
/// Opens a codec reader from its serialized bytes.
pub trait FastFieldCodecDeserializer: Sized {
/// Reads the metadata and returns the CodecReader
///
/// Only callable when the implementing type also implements `FastFieldDataAccess`.
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>
where Self: FastFieldDataAccess;
}

/// Read access to a sequence of `u64` values addressed by position.
pub trait FastFieldDataAccess {
/// Returns the value at position `doc`.
fn get_val(&self, doc: u64) -> u64;
/// Min value of the data.
fn min_value(&self) -> u64;
/// Max value of the data.
fn max_value(&self) -> u64;
/// Number of values.
fn num_vals(&self) -> u64;
/// Returns an iterator over the data.
///
/// The default implementation calls `get_val` for each position in `0..num_vals()`.
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u64> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}

#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
Expand Down Expand Up @@ -85,31 +94,6 @@ pub trait FastFieldCodecSerializer {
) -> io::Result<()>;
}

/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Returns the value associated with the given position.
///
/// Whenever possible, use the Iterator passed to the fastfield creation instead, for
/// performance reasons.
///
/// # Panics
///
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;

/// Returns an iterator over the data.
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_>;

/// Min value of the data.
fn min_value(&self) -> u64;

/// Max value of the data.
fn max_value(&self) -> u64;

/// Number of values.
fn num_vals(&self) -> u64;
}

#[derive(Debug, Clone)]
/// Statistics are used in codec detection and stored in the fast field footer.
pub struct FastFieldStats {
Expand Down Expand Up @@ -169,7 +153,10 @@ mod tests {
use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer};
use crate::linear::{LinearReader, LinearSerializer};

pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
pub fn create_and_validate<
S: FastFieldCodecSerializer,
R: FastFieldCodecDeserializer + FastFieldDataAccess,
>(
data: &[u64],
name: &str,
) -> (f32, f32) {
Expand All @@ -183,8 +170,9 @@ mod tests {
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);

let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64);
let val = reader.get_val(doc as u64);
if val != *orig_val {
panic!(
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
Expand Down Expand Up @@ -228,7 +216,10 @@ mod tests {
data_and_names
}

fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
fn test_codec<
S: FastFieldCodecSerializer,
R: FastFieldDataAccess + FastFieldCodecDeserializer,
>() {
let codec_name = format!("{:?}", S::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_data_sets() {
let (estimate, actual) = crate::tests::create_and_validate::<S, R>(&data, dataset_name);
Expand Down
13 changes: 10 additions & 3 deletions fastfield_codecs/src/linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};

/// Depending on the field type, a different
Expand Down Expand Up @@ -59,7 +59,7 @@ impl FixedSize for LinearFooter {
const SIZE_IN_BYTES: usize = 56;
}

impl FastFieldCodecReader for LinearReader {
impl FastFieldCodecDeserializer for LinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
Expand All @@ -75,8 +75,11 @@ impl FastFieldCodecReader for LinearReader {
slope,
})
}
}

impl FastFieldDataAccess for LinearReader {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
}
Expand All @@ -89,6 +92,10 @@ impl FastFieldCodecReader for LinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
/// Number of values, taken from the deserialized footer.
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}

/// Fastfield serializer, which tries to guess values by linear interpolation
Expand Down
29 changes: 22 additions & 7 deletions src/fastfield/gcd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::num::NonZeroU64;

use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldDataAccess};
use ownedbytes::OwnedBytes;

pub const GCD_DEFAULT: u64 = 1;
Expand All @@ -15,27 +15,33 @@ pub const GCD_DEFAULT: u64 = 1;
pub struct GCDFastFieldCodec<CodecReader> {
// Multiplier applied to each value returned by the wrapped reader.
gcd: u64,
// Added to the value after the multiplication by `gcd`.
min_value: u64,
// Number of values, read from the footer.
num_vals: u64,
// Wrapped codec reader that the stored values are fetched from.
reader: CodecReader,
}

impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
/// Opens a fast field given the bytes.
impl<C: FastFieldDataAccess + FastFieldCodecDeserializer + Clone> FastFieldCodecDeserializer
for GCDFastFieldCodec<C>
{
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
let footer_offset = bytes.len() - 16;
let footer_offset = bytes.len() - 24;
let (body, mut footer) = bytes.split(footer_offset);
let gcd = u64::deserialize(&mut footer)?;
let min_value = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
gcd,
min_value,
num_vals,
reader,
})
}
}

impl<C: FastFieldDataAccess + Clone> FastFieldDataAccess for GCDFastFieldCodec<C> {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
fn get_val(&self, doc: u64) -> u64 {
let mut data = self.reader.get_val(doc);
data *= self.gcd;
data += self.min_value;
data
Expand All @@ -48,11 +54,20 @@ impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec
fn max_value(&self) -> u64 {
self.min_value + self.reader.max_value() * self.gcd
}
/// Number of values, as read from the GCD footer.
fn num_vals(&self) -> u64 {
self.num_vals
}
}

pub fn write_gcd_header<W: Write>(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> {
/// Serializes the GCD metadata to `field_write` as three `u64`s: `gcd`, `min_value`, `num_vals`.
///
/// The write order differs from the parameter order: `gcd` is written first even though
/// `min_value` precedes it in the signature.
///
/// # Errors
///
/// Returns the first I/O error raised while serializing any of the three values.
pub fn write_gcd_header<W: Write>(
field_write: &mut W,
min_value: u64,
gcd: u64,
num_vals: u64,
) -> io::Result<()> {
gcd.serialize(field_write)?;
min_value.serialize(field_write)?;
num_vals.serialize(field_write)?;
Ok(())
}

Expand Down
Loading