diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index af6a58dffc..3a5ae58760 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -25,7 +25,10 @@ mod tests { fn value_iter() -> impl Iterator { 0..20_000 } - fn bench_get( + fn bench_get< + S: FastFieldCodecSerializer, + R: FastFieldCodecDeserializer + FastFieldDataAccess, + >( b: &mut Bencher, data: &[u64], ) { @@ -35,7 +38,7 @@ mod tests { b.iter(|| { let mut sum = 0u64; for pos in value_iter() { - let val = reader.get_u64(pos as u64); + let val = reader.get_val(pos as u64); debug_assert_eq!(data[pos as usize], val); sum = sum.wrapping_add(val); } diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index f336e44405..43e0ea838d 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -5,7 +5,7 @@ use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use crate::{ - FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, + FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, }; /// Depending on the field type, a different @@ -16,27 +16,32 @@ pub struct BitpackedReader { bit_unpacker: BitUnpacker, pub min_value_u64: u64, pub max_value_u64: u64, + pub num_vals: u64, } -impl FastFieldCodecReader for BitpackedReader { +impl FastFieldCodecDeserializer for BitpackedReader { /// Opens a fast field given a file. fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - 16; + let footer_offset = bytes.len() - 24; let (data, mut footer) = bytes.split(footer_offset); let min_value = u64::deserialize(&mut footer)?; let amplitude = u64::deserialize(&mut footer)?; + let num_vals = u64::deserialize(&mut footer)?; let max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); let bit_unpacker = BitUnpacker::new(num_bits); Ok(BitpackedReader { data, + bit_unpacker, min_value_u64: min_value, max_value_u64: max_value, - bit_unpacker, + num_vals, }) } +} +impl FastFieldDataAccess for BitpackedReader { #[inline] - fn get_u64(&self, doc: u64) -> u64 { + fn get_val(&self, doc: u64) -> u64 { self.min_value_u64 + self.bit_unpacker.get(doc, &self.data) } #[inline] @@ -47,11 +52,16 @@ impl FastFieldCodecReader for BitpackedReader { fn max_value(&self) -> u64 { self.max_value_u64 } + #[inline] + fn num_vals(&self) -> u64 { + self.num_vals + } } pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> { bit_packer: BitPacker, write: &'a mut W, min_value: u64, + num_vals: u64, amplitude: u64, num_bits: u8, } @@ -78,6 +88,7 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> { bit_packer, write, min_value, + num_vals: 0, amplitude, num_bits, }) @@ -88,12 +99,14 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> { let val_to_write: u64 = val - self.min_value; self.bit_packer .write(val_to_write, self.num_bits, &mut self.write)?; + self.num_vals += 1; Ok(()) } pub fn close_field(mut self) -> io::Result<()> { self.bit_packer.close(&mut self.write)?; self.min_value.serialize(&mut self.write)?; self.amplitude.serialize(&mut self.write)?; + self.num_vals.serialize(&mut self.write)?; Ok(()) } } diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 6f59763b53..7db3abc29e 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -19,7 +19,7 @@ use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use crate::linear::{get_calculated_value, get_slope}; use crate::{ - FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, + FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, }; const CHUNK_SIZE: u64 = 512; @@ -148,7 +148,7 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio &interpolations[get_interpolation_position(doc)] } -impl FastFieldCodecReader for BlockwiseLinearReader { +impl FastFieldCodecDeserializer for BlockwiseLinearReader { /// Opens a fast field given a file. fn open_from_bytes(bytes: OwnedBytes) -> io::Result { let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; @@ -157,9 +157,11 @@ impl FastFieldCodecReader for BlockwiseLinearReader { let footer = BlockwiseLinearFooter::deserialize(&mut footer)?; Ok(BlockwiseLinearReader { data, footer }) } +} +impl FastFieldDataAccess for BlockwiseLinearReader { #[inline] - fn get_u64(&self, idx: u64) -> u64 { + fn get_val(&self, idx: u64) -> u64 { let interpolation = get_interpolation_function(idx, &self.footer.interpolations); let in_block_idx = idx - interpolation.start_pos; let calculated_value = get_calculated_value( @@ -182,6 +184,10 @@ impl FastFieldCodecReader for BlockwiseLinearReader { fn max_value(&self) -> u64 { self.footer.max_value } + #[inline] + fn num_vals(&self) -> u64 { + self.footer.num_vals + } } /// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements. diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index e72844be4b..626a0686c3 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -12,12 +12,21 @@ pub mod bitpacked; pub mod blockwise_linear; pub mod linear; -pub trait FastFieldCodecReader: Sized { - /// reads the metadata and returns the CodecReader - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result; - fn get_u64(&self, doc: u64) -> u64; +pub trait FastFieldCodecDeserializer: Sized { + /// Reads the metadata and returns the CodecReader + fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result + where Self: FastFieldDataAccess; +} + +pub trait FastFieldDataAccess { + fn get_val(&self, doc: u64) -> u64; fn min_value(&self) -> u64; fn max_value(&self) -> u64; + fn num_vals(&self) -> u64; + /// Returns a iterator over the data + fn iter<'a>(&'a self) -> Box + 'a> { + Box::new((0..self.num_vals()).map(|idx| self.get_val(idx))) + } } #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] @@ -85,31 +94,6 @@ pub trait FastFieldCodecSerializer { ) -> io::Result<()>; } -/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation. -pub trait FastFieldDataAccess { - /// Return the value associated to the given position. - /// - /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance - /// reasons. - /// - /// # Panics - /// - /// May panic if `position` is greater than the index. - fn get_val(&self, position: u64) -> u64; - - /// Returns a iterator over the data - fn iter(&self) -> Box + '_>; - - /// min value of the data - fn min_value(&self) -> u64; - - /// max value of the data - fn max_value(&self) -> u64; - - /// num vals - fn num_vals(&self) -> u64; -} - #[derive(Debug, Clone)] /// Statistics are used in codec detection and stored in the fast field footer. pub struct FastFieldStats { @@ -169,7 +153,10 @@ mod tests { use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; use crate::linear::{LinearReader, LinearSerializer}; - pub fn create_and_validate( + pub fn create_and_validate< + S: FastFieldCodecSerializer, + R: FastFieldCodecDeserializer + FastFieldDataAccess, + >( data: &[u64], name: &str, ) -> (f32, f32) { @@ -183,8 +170,9 @@ mod tests { let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap(); + assert_eq!(reader.num_vals(), data.len() as u64); for (doc, orig_val) in data.iter().enumerate() { - let val = reader.get_u64(doc as u64); + let val = reader.get_val(doc as u64); if val != *orig_val { panic!( "val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \ @@ -228,7 +216,10 @@ mod tests { data_and_names } - fn test_codec() { + fn test_codec< + S: FastFieldCodecSerializer, + R: FastFieldDataAccess + FastFieldCodecDeserializer, + >() { let codec_name = format!("{:?}", S::CODEC_TYPE); for (data, dataset_name) in get_codec_test_data_sets() { let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index a790ca3be4..bf50f7f1b6 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -6,7 +6,7 @@ use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use crate::{ - FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, + FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, }; /// Depending on the field type, a different @@ -59,7 +59,7 @@ impl FixedSize for LinearFooter { const SIZE_IN_BYTES: usize = 56; } -impl FastFieldCodecReader for LinearReader { +impl FastFieldCodecDeserializer for LinearReader { /// Opens a fast field given a file. fn open_from_bytes(bytes: OwnedBytes) -> io::Result { let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES; @@ -75,8 +75,11 @@ impl FastFieldCodecReader for LinearReader { slope, }) } +} + +impl FastFieldDataAccess for LinearReader { #[inline] - fn get_u64(&self, doc: u64) -> u64 { + fn get_val(&self, doc: u64) -> u64 { let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope); (calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset } @@ -89,6 +92,10 @@ impl FastFieldCodecReader for LinearReader { fn max_value(&self) -> u64 { self.footer.max_value } + #[inline] + fn num_vals(&self) -> u64 { + self.footer.num_vals + } } /// Fastfield serializer, which tries to guess values by linear interpolation diff --git a/src/fastfield/gcd.rs b/src/fastfield/gcd.rs index 50cb2594a8..8e706d12ff 100644 --- a/src/fastfield/gcd.rs +++ b/src/fastfield/gcd.rs @@ -3,7 +3,7 @@ use std::num::NonZeroU64; use common::BinarySerializable; use fastdivide::DividerU64; -use fastfield_codecs::FastFieldCodecReader; +use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldDataAccess}; use ownedbytes::OwnedBytes; pub const GCD_DEFAULT: u64 = 1; @@ -15,27 +15,33 @@ pub const GCD_DEFAULT: u64 = 1; pub struct GCDFastFieldCodec { gcd: u64, min_value: u64, + num_vals: u64, reader: CodecReader, } -impl FastFieldCodecReader for GCDFastFieldCodec { - /// Opens a fast field given the bytes. +impl FastFieldCodecDeserializer + for GCDFastFieldCodec +{ fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result { - let footer_offset = bytes.len() - 16; + let footer_offset = bytes.len() - 24; let (body, mut footer) = bytes.split(footer_offset); let gcd = u64::deserialize(&mut footer)?; let min_value = u64::deserialize(&mut footer)?; + let num_vals = u64::deserialize(&mut footer)?; let reader = C::open_from_bytes(body)?; Ok(GCDFastFieldCodec { gcd, min_value, + num_vals, reader, }) } +} +impl FastFieldDataAccess for GCDFastFieldCodec { #[inline] - fn get_u64(&self, doc: u64) -> u64 { - let mut data = self.reader.get_u64(doc); + fn get_val(&self, doc: u64) -> u64 { + let mut data = self.reader.get_val(doc); data *= self.gcd; data += self.min_value; data @@ -48,11 +54,20 @@ impl FastFieldCodecReader for GCDFastFieldCodec fn max_value(&self) -> u64 { self.min_value + self.reader.max_value() * self.gcd } + fn num_vals(&self) -> u64 { + self.num_vals + } } -pub fn write_gcd_header(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> { +pub fn write_gcd_header( + field_write: &mut W, + min_value: u64, + gcd: u64, + num_vals: u64, +) -> io::Result<()> { gcd.serialize(field_write)?; min_value.serialize(field_write)?; + num_vals.serialize(field_write)?; Ok(()) } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index d5cf0e2037..851d5df6a5 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -326,7 +326,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 37); + assert_eq!(file.len(), 45); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(*FIELD).unwrap(); let fast_field_reader = DynamicFastFieldReader::::open(file)?; @@ -357,7 +357,7 @@ mod tests { serializer.close()?; } let file = directory.open_read(path)?; - assert_eq!(file.len(), 62); + assert_eq!(file.len(), 70); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); @@ -393,7 +393,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 35); + assert_eq!(file.len(), 43); { let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap(); @@ -425,7 +425,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 80043); + assert_eq!(file.len(), 80051); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); @@ -896,7 +896,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 36); + assert_eq!(file.len(), 44); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(field).unwrap(); let fast_field_reader = DynamicFastFieldReader::::open(file)?; @@ -932,7 +932,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 48); + assert_eq!(file.len(), 56); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(field).unwrap(); let fast_field_reader = DynamicFastFieldReader::::open(file)?; @@ -966,7 +966,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 35); + assert_eq!(file.len(), 43); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(field).unwrap(); let fast_field_reader = DynamicFastFieldReader::::open(file)?; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index f3c8d28207..7afedf6f52 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -6,7 +6,7 @@ use common::BinarySerializable; use fastfield_codecs::bitpacked::BitpackedReader; use fastfield_codecs::blockwise_linear::BlockwiseLinearReader; use fastfield_codecs::linear::LinearReader; -use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecType}; +use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldCodecType, FastFieldDataAccess}; use super::{FastValue, GCDFastFieldCodec}; use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; @@ -199,7 +199,9 @@ pub struct FastFieldReaderCodecWrapper { _phantom: PhantomData, } -impl FastFieldReaderCodecWrapper { +impl + FastFieldReaderCodecWrapper +{ /// Opens a fast field given a file. pub fn open(file: FileSlice) -> crate::Result { let mut bytes = file.read_bytes()?; @@ -226,7 +228,7 @@ impl FastFieldReaderCodecWrapper Item { - let data = self.reader.get_u64(doc); + let data = self.reader.get_val(doc); Item::from_u64(data) } @@ -249,8 +251,8 @@ impl FastFieldReaderCodecWrapper FastFieldReader - for FastFieldReaderCodecWrapper +impl + FastFieldReader for FastFieldReaderCodecWrapper { /// Return the value associated to the given document. /// diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 1655cd7daa..871a049787 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -189,7 +189,7 @@ impl CompositeFastFieldSerializer { field_write, fastfield_accessor, )?; - write_gcd_header(field_write, base_value, gcd)?; + write_gcd_header(field_write, base_value, gcd, num_vals)?; Ok(()) } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 0cef6a905a..6d0000e5c2 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -2,12 +2,13 @@ use std::collections::HashMap; use std::io; use common; +use fastfield_codecs::FastFieldDataAccess; use fnv::FnvHashMap; use tantivy_bitpacker::BlockedBitpacker; use super::multivalued::MultiValuedFastFieldWriter; use super::serializer::FastFieldStats; -use super::{FastFieldDataAccess, FastFieldType, FastValue}; +use super::{FastFieldType, FastValue}; use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId;