Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Store fuzzy/bucketed positions in word_position_docids database #746

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions milli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,27 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi
(field_id as u32) << 16 | (relative as u32)
}

/// Compute the "bucketed" absolute position from the field id and relative position in the field.
///
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
pub fn bucketed_absolute_from_relative_position(
field_id: FieldId,
relative: RelativePosition,
) -> Position {
// The first few relative positions are kept intact.
if relative < 16 {
absolute_from_relative_position(field_id, relative)
} else if relative < 24 {
// Relative positions between 16 and 24 all become equal to 24
absolute_from_relative_position(field_id, 24)
} else {
// Then, groups of positions that have the same base-2 logarithm are reduced to
// the same relative position: the smallest power of 2 that is greater than them
let relative = (relative as f64).log2().ceil().exp2() as u16;
absolute_from_relative_position(field_id, relative)
}
}

/// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json(
displayed_fields: &[FieldId],
Expand Down Expand Up @@ -329,4 +350,51 @@ mod tests {

assert_eq!(&actual, expected);
}

#[test]
fn bucketed_position() {
    // Relative positions below 16 are kept intact.
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 0), @"0");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 1), @"1");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 2), @"2");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 15), @"15");
    // Relative positions from 16 to 23 are all bucketed to 24.
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 16), @"24");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 19), @"24");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 20), @"24");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 21), @"24");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 22), @"24");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 23), @"24");
    // From 24 onwards, positions are rounded up to a power of two.
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 24), @"32");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 25), @"32");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 30), @"32");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 40), @"64");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 50), @"64");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 60), @"64");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 70), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 80), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 90), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 100), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 110), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 120), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 130), @"256");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 1000), @"1024");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 2000), @"2048");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 4000), @"4096");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 8000), @"8192");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 9000), @"16384");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 10_000), @"16384");
    // The next power of two (65536) does not fit, so the result saturates at 65535.
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65_500), @"65535");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65_535), @"65535");

    // Exact powers of two are their own bucket; the value right after starts the next bucket.
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 31), @"32");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 32), @"32");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 33), @"64");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 64), @"64");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65), @"128");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 32_768), @"32768");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 32_769), @"65535");

    // The field id occupies the upper 16 bits of the absolute position.
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 0), @"65536");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 1), @"65537");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 20), @"65560");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 1000), @"66560");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 65_535), @"131071");

    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(2, 0), @"131072");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(2, 65_535), @"196607");

    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(65_535, 0), @"4294901760");
    insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(65_535, 65_535), @"4294967295");
}
}
18 changes: 12 additions & 6 deletions milli/src/search/criteria/exactness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ use crate::search::criteria::{
InitialCandidates,
};
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{absolute_from_relative_position, FieldId, Result};
use crate::{
absolute_from_relative_position, bucketed_absolute_from_relative_position, FieldId, Result,
};

pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>,
Expand Down Expand Up @@ -285,30 +287,34 @@ fn attribute_start_with_docids(
) -> heed::Result<Vec<RoaringBitmap>> {
let mut attribute_candidates_array = Vec::new();
// start from attribute first position
let mut pos = absolute_from_relative_position(attribute_id, 0);
let mut relative_pos = 0;
for part in query {
use ExactQueryPart::*;
match part {
Synonyms(synonyms) => {
let bucketed_position =
bucketed_absolute_from_relative_position(attribute_id, relative_pos);
let mut synonyms_candidates = RoaringBitmap::new();
for word in synonyms {
let wc = ctx.word_position_docids(word, pos)?;
let wc = ctx.word_position_docids(word, bucketed_position)?;
if let Some(word_candidates) = wc {
synonyms_candidates |= word_candidates;
}
}
attribute_candidates_array.push(synonyms_candidates);
pos += 1;
relative_pos += 1;
}
Phrase(phrase) => {
for word in phrase {
let bucketed_position =
bucketed_absolute_from_relative_position(attribute_id, relative_pos);
if let Some(word) = word {
let wc = ctx.word_position_docids(word, pos)?;
let wc = ctx.word_position_docids(word, bucketed_position)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
}
}
pos += 1;
relative_pos += 1;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ use super::helpers::{
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{DocumentId, Result};
use crate::{
bucketed_absolute_from_relative_position, relative_from_absolute_position, DocumentId, Result,
};

/// Extracts the word positions and the documents ids where this word appear.
///
Expand Down Expand Up @@ -37,9 +39,12 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
let document_id = DocumentId::from_be_bytes(document_id_bytes);

for position in read_u32_ne_bytes(value) {
let (field_id, relative) = relative_from_absolute_position(position);
let bucketed_position = bucketed_absolute_from_relative_position(field_id, relative);

key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.extend_from_slice(&position.to_be_bytes());
key_buffer.extend_from_slice(&bucketed_position.to_be_bytes());

word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
Expand Down