Skip to content

Commit

Permalink
Removing Deserializer trait (#1489)
Browse files Browse the repository at this point in the history
Removing Deserializer trait and renaming the `Serializer` trait to `FastFieldCodec`.
Small refactoring of `estimate`.
  • Loading branch information
fulmicoton authored Aug 27, 2022
1 parent 0dd6216 commit 54cfd0d
Show file tree
Hide file tree
Showing 10 changed files with 291 additions and 340 deletions.
37 changes: 16 additions & 21 deletions fastfield_codecs/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ extern crate test;

#[cfg(test)]
mod tests {
use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer};
use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer};
use fastfield_codecs::linear::{LinearReader, LinearSerializer};
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::*;

fn get_data() -> Vec<u64> {
Expand All @@ -25,16 +25,10 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn bench_get<
S: FastFieldCodecSerializer,
R: FastFieldCodecDeserializer + FastFieldDataAccess,
>(
b: &mut Bencher,
data: &[u64],
) {
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
S::serialize(&mut bytes, &data).unwrap();
let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
Codec::serialize(&mut bytes, &data).unwrap();
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
Expand All @@ -45,10 +39,11 @@ mod tests {
sum
});
}
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = Vec::new();
b.iter(|| {
S::serialize(&mut bytes, &data).unwrap();
bytes.clear();
Codec::serialize(&mut bytes, &data).unwrap();
});
}

Expand All @@ -57,32 +52,32 @@ mod tests {
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedSerializer>(b, &data);
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearSerializer>(b, &data);
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearSerializer>(b, &data);
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedSerializer, BitpackedReader>(b, &data);
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearSerializer, LinearReader>(b, &data);
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearSerializer, BlockwiseLinearReader>(b, &data);
bench_get::<BlockwiseLinearCodec>(b, &data);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
Expand Down
69 changes: 33 additions & 36 deletions fastfield_codecs/src/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,19 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};

/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
pub num_vals: u64,
min_value_u64: u64,
max_value_u64: u64,
num_vals: u64,
}

impl FastFieldCodecDeserializer for BitpackedReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
})
}
}
impl FastFieldDataAccess for BitpackedReader {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
Expand Down Expand Up @@ -111,12 +89,33 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
}
}

pub struct BitpackedSerializer {}
pub struct BitpackedCodec;

impl FastFieldCodecSerializer for BitpackedSerializer {
impl FastFieldCodec for BitpackedCodec {
/// The CODEC_TYPE is an enum value used for serialization.
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;

type Reader = BitpackedReader;

/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
})
}

/// Serializes data with the BitpackedFastFieldSerializer.
///
/// The serializer in fact encode the values by bitpacking
Expand All @@ -142,29 +141,27 @@ impl FastFieldCodecSerializer for BitpackedSerializer {

Ok(())
}
fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
true
}
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
use crate::tests::get_codec_test_datasets;

fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedSerializer, BitpackedReader>(data, name);
crate::tests::create_and_validate::<BitpackedCodec>(data, name);
}

#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
Expand Down
66 changes: 31 additions & 35 deletions fastfield_codecs/src/blockwise_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::linear::{get_calculated_value, get_slope};
use crate::{
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};

const CHUNK_SIZE: u64 = 512;

Expand Down Expand Up @@ -148,17 +146,6 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)]
}

impl FastFieldCodecDeserializer for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
}

impl FastFieldDataAccess for BlockwiseLinearReader {
#[inline]
fn get_val(&self, idx: u64) -> u64 {
Expand Down Expand Up @@ -191,10 +178,22 @@ impl FastFieldDataAccess for BlockwiseLinearReader {
}

/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct BlockwiseLinearSerializer {}
pub struct BlockwiseLinearCodec;

impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
impl FastFieldCodec for BlockwiseLinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;

type Reader = BlockwiseLinearReader;

/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}

/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
Expand Down Expand Up @@ -290,10 +289,14 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
Ok(())
}

fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
if fastfield_accessor.num_vals() < 5_000 {
return false;
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are,
/// and the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;
}

// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theoretical offset and add this to the max value.
Expand All @@ -305,14 +308,9 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
.checked_add(theorethical_maximum_offset)
.is_none()
{
return false;
return None;
}
true
}
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are,
/// and the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
let last_val_in_first_block =
Expand Down Expand Up @@ -351,7 +349,7 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
// function metadata per block
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

Expand All @@ -366,12 +364,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
use crate::tests::get_codec_test_datasets;

fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(
data, name,
)
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
}

const HIGHEST_BIT: u64 = 1 << 63;
Expand All @@ -385,7 +381,7 @@ mod tests {
.map(i64_to_u64)
.collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large i64");
create_and_validate(&data, "simple monotonically large i64").unwrap();
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
Expand All @@ -396,7 +392,7 @@ mod tests {
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
create_and_validate(&data, "simple monotonically large").unwrap();
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
Expand All @@ -405,7 +401,7 @@ mod tests {

#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
Expand Down
Loading

0 comments on commit 54cfd0d

Please sign in to comment.