Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

introduce the roaring multiop in milli #581

Draft
wants to merge 25 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] }
criterion = { version = "0.3.5", features = ["html_reports"] }
rand = "0.8.5"
rand_chacha = "0.3.1"
roaring = "0.9.0"
roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" }

[build-dependencies]
anyhow = "1.0.56"
Expand Down
3 changes: 2 additions & 1 deletion infos/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
csv = "1.1.6"
milli = { path = "../milli" }
mimalloc = { version = "0.1.29", default-features = false }
roaring = "0.9.0"
# roaring = "0.9.0"
roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" }
serde_json = "1.0.79"
stderrlog = "0.5.1"
structopt = { version = "0.3.26", default-features = false }
4 changes: 3 additions & 1 deletion milli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2018"
[dependencies]
bimap = { version = "0.6.2", features = ["serde"] }
bincode = "1.3.3"
bytemuck = { version = "1.12.1", features = ["extern_crate_alloc"] }
bstr = "0.2.17"
byteorder = "1.4.3"
charabia = "0.6.0"
Expand All @@ -26,7 +27,7 @@ obkv = "0.2.0"
once_cell = "1.10.0"
ordered-float = "2.10.0"
rayon = "1.5.1"
roaring = "0.9.0"
roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" }
rstar = { version = "0.9.2", features = ["serde"] }
serde = { version = "1.0.136", features = ["derive"] }
serde_json = { version = "1.0.79", features = ["preserve_order"] }
Expand Down Expand Up @@ -55,6 +56,7 @@ insta = "1.18.1"
maplit = "1.0.2"
md5 = "0.7.0"
rand = "0.8.5"
select-rustc = "0.1"

[features]
default = []
122 changes: 97 additions & 25 deletions milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::io;
use std::mem::size_of;

use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;
use roaring::{MultiOps, RoaringBitmap};

/// This is the limit where using a byteorder became less size efficient
/// than using a direct roaring encoding, it is also the point where we are able
Expand Down Expand Up @@ -54,47 +54,71 @@ impl CboRoaringBitmapCodec {
}

/// Merge serialized CboRoaringBitmaps in a buffer.
/// The buffer MUST BE empty.
///
/// if the merged values length is under the threshold, values are directly
/// serialized in the buffer else a RoaringBitmap is created from the
/// values and is serialized in the buffer.
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
let mut roaring = RoaringBitmap::new();
let mut vec = Vec::new();

for bytes in slices {
if bytes.len() <= THRESHOLD * size_of::<u32>() {
let mut reader = bytes.as_ref();
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
vec.push(integer);
debug_assert!(buffer.len() == 0);

let bitmaps = slices
.iter()
.filter_map(|slice| {
if slice.len() <= THRESHOLD * size_of::<u32>() {
buffer.extend(slice.as_ref());
None
} else {
RoaringBitmap::deserialize_from(slice.as_ref()).into()
}
} else {
roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?;
}
}
})
.collect::<io::Result<Vec<_>>>()?;

if roaring.is_empty() {
vec.sort_unstable();
vec.dedup();
let u32_buffer: &mut Vec<u32> = unsafe { convert_vec(buffer) };
u32_buffer.sort_unstable();
u32_buffer.dedup();

if vec.len() <= THRESHOLD {
for integer in vec {
buffer.extend_from_slice(&integer.to_ne_bytes());
}
} else {
// We can unwrap safely because the vector is sorted upper.
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
if bitmaps.is_empty() {
if u32_buffer.len() > THRESHOLD {
// We can unwrap safely because the vector is sorted above.
let roaring = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap();

let buffer: &mut Vec<u8> = unsafe { convert_vec(u32_buffer) };
buffer.clear();
roaring.serialize_into(buffer)?;
} else {
// we still need to fix the size of the buffer
let _buffer: &mut Vec<u8> = unsafe { convert_vec(u32_buffer) };
}
} else {
roaring.extend(vec);
roaring.serialize_into(buffer)?;
let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap();
let buffer: &mut Vec<u8> = unsafe { convert_vec(u32_buffer) };
let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).union();
buffer.clear();
bitmap.serialize_into(buffer)?;
}

Ok(())
}
}

/// Convert a `Vec` of `T` into a `Vec` of `U` by keeping the same allocation and
/// only updating the size of the `Vec`.
/// To make this works `size_of::<T>() * input.len() % size_of::<U>()` must be equal to zero.
unsafe fn convert_vec<T, U>(input: &mut Vec<T>) -> &mut Vec<U> {
debug_assert!(
size_of::<T>() * input.len() % size_of::<U>() == 0,
"called with incompatible types"
);

let new_len = size_of::<T>() * input.len() / size_of::<U>();

let ret: &mut Vec<U> = std::mem::transmute(input);
ret.set_len(new_len);

ret
}

impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
type DItem = RoaringBitmap;

Expand Down Expand Up @@ -183,4 +207,52 @@ mod tests {
let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();
assert_eq!(bitmap, expected);
}

#[cfg(feature = "nightly")]
mod bench {
extern crate test;
use test::Bencher;

#[bench]
fn bench_small_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) {
#[rustfmt::skip]
let inputs = [
vec![Cow::Owned(vec![255, 56, 14, 0]), Cow::Owned(vec![196, 43, 14, 0])],
vec![Cow::Owned(vec![63, 101, 3, 0]), Cow::Owned(vec![71, 136, 3, 0])],
vec![Cow::Owned(vec![68, 108, 0, 0]), Cow::Owned(vec![85, 104, 0, 0]), Cow::Owned(vec![204, 103, 0, 0])],
vec![Cow::Owned(vec![199, 101, 7, 0]), Cow::Owned(vec![94, 42, 7, 0])],
vec![Cow::Owned(vec![173, 219, 12, 0]), Cow::Owned(vec![146, 3, 13, 0])],
vec![Cow::Owned(vec![13, 152, 3, 0]), Cow::Owned(vec![64, 120, 3, 0])],
vec![Cow::Owned(vec![109, 253, 13, 0]), Cow::Owned(vec![108, 232, 13, 0])],
vec![Cow::Owned(vec![73, 176, 3, 0]), Cow::Owned(vec![126, 167, 3, 0])],
];

let mut vec = Vec::new();
for input in inputs {
bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec));
vec.clear();
}
}

#[bench]
fn bench_medium_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) {
#[rustfmt::skip]
let inputs = [
vec![Cow::Owned(vec![232, 35, 9, 0]), Cow::Owned(vec![192, 10, 9, 0]), Cow::Owned(vec![91, 33, 9, 0]), Cow::Owned(vec![204, 29, 9, 0])],
vec![Cow::Owned(vec![144, 39, 9, 0]), Cow::Owned(vec![162, 66, 9, 0]), Cow::Owned(vec![146, 11, 9, 0]), Cow::Owned(vec![174, 61, 9, 0])],
vec![Cow::Owned(vec![83, 70, 7, 0]), Cow::Owned(vec![115, 72, 7, 0]), Cow::Owned(vec![219, 54, 7, 0]), Cow::Owned(vec![1, 93, 7, 0]), Cow::Owned(vec![195, 77, 7, 0]), Cow::Owned(vec![21, 86, 7, 0])],
vec![Cow::Owned(vec![244, 112, 0, 0]), Cow::Owned(vec![48, 126, 0, 0]), Cow::Owned(vec![72, 142, 0, 0]), Cow::Owned(vec![255, 113, 0, 0]), Cow::Owned(vec![101, 114, 0, 0]), Cow::Owned(vec![66, 88, 0, 0]), Cow::Owned(vec![84, 92, 0, 0]), Cow::Owned(vec![194, 137, 0, 0]), Cow::Owned(vec![208, 132, 0, 0])],
vec![Cow::Owned(vec![8, 57, 7, 0]), Cow::Owned(vec![133, 115, 7, 0]), Cow::Owned(vec![219, 94, 7, 0]), Cow::Owned(vec![46, 95, 7, 0]), Cow::Owned(vec![156, 111, 7, 0]), Cow::Owned(vec![63, 107, 7, 0]), Cow::Owned(vec![31, 47, 7, 0])],
vec![Cow::Owned(vec![165, 78, 0, 0]), Cow::Owned(vec![197, 95, 0, 0]), Cow::Owned(vec![194, 82, 0, 0]), Cow::Owned(vec![142, 91, 0, 0]), Cow::Owned(vec![120, 94, 0, 0])],
vec![Cow::Owned(vec![185, 187, 13, 0]), Cow::Owned(vec![41, 187, 13, 0]), Cow::Owned(vec![245, 223, 13, 0]), Cow::Owned(vec![211, 251, 13, 0]), Cow::Owned(vec![192, 193, 13, 0]), Cow::Owned(vec![215, 230, 13, 0]), Cow::Owned(vec![252, 207, 13, 0]), Cow::Owned(vec![131, 213, 13, 0]), Cow::Owned(vec![219, 187, 13, 0]), Cow::Owned(vec![105, 236, 13, 0]), Cow::Owned(vec![30, 239, 13, 0]), Cow::Owned(vec![13, 200, 13, 0]), Cow::Owned(vec![111, 197, 13, 0]), Cow::Owned(vec![87, 222, 13, 0]), Cow::Owned(vec![7, 205, 13, 0]), Cow::Owned(vec![90, 211, 13, 0])],
vec![Cow::Owned(vec![215, 253, 13, 0]), Cow::Owned(vec![225, 194, 13, 0]), Cow::Owned(vec![37, 189, 13, 0]), Cow::Owned(vec![242, 212, 13, 0])],
];

let mut vec = Vec::new();
for input in inputs {
bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec));
vec.clear();
}
}
}
}
2 changes: 2 additions & 0 deletions milli/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![cfg_attr(feature = "nightly", feature(test))]

#[macro_use]
pub mod documents;

Expand Down
100 changes: 45 additions & 55 deletions milli/src/search/criteria/exactness.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
use std::convert::TryFrom;
use std::mem::take;
use std::ops::BitOr;

use itertools::Itertools;
use log::debug;
use roaring::RoaringBitmap;
use roaring::{MultiOps, RoaringBitmap};

use crate::search::criteria::{
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
Expand Down Expand Up @@ -173,35 +172,41 @@ fn resolve_state(
use State::*;
match state {
ExactAttribute(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new();
if let Ok(query_len) = u8::try_from(query.len()) {
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
if let Some(attribute_allowed_docids) =
ctx.field_id_word_count_docids(id, query_len)?
{

let mut candidates = attributes_ids
.into_iter()
.filter_map(|id| {
ctx.field_id_word_count_docids(id, query_len)
.transpose()
.map(|res| (id, res))
})
.map(|(id, attribute_allowed_docids)| -> Result<_> {
let mut attribute_candidates_array =
attribute_start_with_docids(ctx, id, query)?;
attribute_candidates_array.push(attribute_allowed_docids);
candidates |= intersection_of(attribute_candidates_array.iter().collect());
}
}
attribute_candidates_array.push(attribute_allowed_docids?);
Ok(attribute_candidates_array.into_iter().intersection())
})
.union()?;

// only keep allowed candidates
candidates &= &allowed_candidates;
// remove current candidates from allowed candidates
allowed_candidates -= &candidates;
}

Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
} else {
Ok((RoaringBitmap::new(), Some(AttributeStartsWith(allowed_candidates))))
}
}
AttributeStartsWith(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new();
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
candidates |= intersection_of(attribute_candidates_array.iter().collect());
}

let mut candidates = attributes_ids
.into_iter()
.map(|id| attribute_start_with_docids(ctx, id, query).map(MultiOps::intersection))
.union()?;

// only keep allowed candidates
candidates &= &allowed_candidates;
Expand All @@ -218,27 +223,24 @@ fn resolve_state(
use ExactQueryPart::*;
match part {
Synonyms(synonyms) => {
for synonym in synonyms {
if let Some(synonym_candidates) = ctx.word_docids(synonym)? {
candidates |= synonym_candidates;
}
}
let tmp = synonyms
.into_iter()
.filter_map(|synonym| ctx.word_docids(synonym).transpose())
.union()?;

candidates |= tmp;
}
// compute intersection on pair of words with a proximity of 0.
Phrase(phrase) => {
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
for words in phrase.windows(2) {
if let [left, right] = words {
match ctx.word_pair_proximity_docids(left, right, 0)? {
Some(docids) => bitmaps.push(docids),
None => {
bitmaps.clear();
break;
}
}
}
}
candidates |= intersection_of(bitmaps.iter().collect());
let bitmaps = phrase
.windows(2)
.map(|words| {
ctx.word_pair_proximity_docids(&words[0], &words[1], 0)
.map(|o| o.unwrap_or_default())
})
.intersection()?;

candidates |= bitmaps;
}
}
parts_candidates_array.push(candidates);
Expand All @@ -247,7 +249,7 @@ fn resolve_state(
let mut candidates_array = Vec::new();

// compute documents that contain all exact words.
let mut all_exact_candidates = intersection_of(parts_candidates_array.iter().collect());
let mut all_exact_candidates = parts_candidates_array.iter().intersection();
all_exact_candidates &= &allowed_candidates;
allowed_candidates -= &all_exact_candidates;

Expand All @@ -258,9 +260,9 @@ fn resolve_state(
// create all `c_count` combinations of exact words
.combinations(c_count)
// intersect each word candidates in combinations
.map(intersection_of)
.map(MultiOps::intersection)
// union combinations of `c_count` exact words
.fold(RoaringBitmap::new(), RoaringBitmap::bitor);
.union();
// only keep allowed candidates
combinations_candidates &= &allowed_candidates;
// remove current candidates from allowed candidates
Expand Down Expand Up @@ -299,13 +301,10 @@ fn attribute_start_with_docids(
use ExactQueryPart::*;
match part {
Synonyms(synonyms) => {
let mut synonyms_candidates = RoaringBitmap::new();
for word in synonyms {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
synonyms_candidates |= word_candidates;
}
}
let synonyms_candidates = synonyms
.into_iter()
.filter_map(|word| ctx.word_position_docids(word, pos).transpose())
.union()?;
attribute_candidates_array.push(synonyms_candidates);
pos += 1;
}
Expand All @@ -324,15 +323,6 @@ fn attribute_start_with_docids(
Ok(attribute_candidates_array)
}

fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
rbs.sort_unstable_by_key(|rb| rb.len());
let mut iter = rbs.into_iter();
match iter.next() {
Some(first) => iter.fold(first.clone(), |acc, rb| acc & rb),
None => RoaringBitmap::new(),
}
}

#[derive(Debug, Clone)]
pub enum ExactQueryPart {
Phrase(Vec<String>),
Expand Down
Loading