diff --git a/columnar/src/block_accessor.rs b/columnar/src/block_accessor.rs index d596805a1a..d5c0acd7e2 100644 --- a/columnar/src/block_accessor.rs +++ b/columnar/src/block_accessor.rs @@ -25,7 +25,7 @@ impl pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column, missing: T) { self.fetch_block(docs, accessor); // We can compare docid_cache with docs to find missing docs - if docs.len() != self.docid_cache.len() { + if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() { self.missing_docids_cache.clear(); find_missing_docs(docs, &self.docid_cache, |doc| { self.missing_docids_cache.push(doc); @@ -50,8 +50,12 @@ impl } } +/// Given two sorted lists of docids `docs` and `hits`, where `hits` is a subset of `docs`, +/// calls `callback` with every doc in `docs` that is not in `hits`. fn find_missing_docs(docs: &[u32], hits: &[u32], mut callback: F) -where F: FnMut(u32) { +where + F: FnMut(u32), +{ let mut docs_iter = docs.iter(); let mut hits_iter = hits.iter(); diff --git a/columnar/src/column_index/mod.rs b/columnar/src/column_index/mod.rs index 1a0e9073ce..41865e67ae 100644 --- a/columnar/src/column_index/mod.rs +++ b/columnar/src/column_index/mod.rs @@ -37,6 +37,10 @@ impl From for ColumnIndex { } impl ColumnIndex { + #[inline] + pub fn is_multivalue(&self) -> bool { + matches!(self, ColumnIndex::Multivalued(_)) + } // Returns the cardinality of the column index. 
// // By convention, if the column contains no docs, we consider that it is diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 4627ee27a6..f71fac023f 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -1584,6 +1584,95 @@ mod tests { Ok(()) } + #[test] + fn terms_aggregation_missing_multi_value() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", FAST); + let id_field = schema_builder.add_text_field("id", FAST); + let index = Index::create_in_ram(schema_builder.build()); + { + let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?; + index_writer.set_merge_policy(Box::new(NoMergePolicy)); + index_writer.add_document(doc!( + text_field => "Hello Hello", + text_field => "Hello Hello", + id_field => 1u64, + id_field => 1u64, + ))?; + // Missing + index_writer.add_document(doc!())?; + index_writer.add_document(doc!( + text_field => "Hello Hello", + ))?; + index_writer.add_document(doc!( + text_field => "Hello Hello", + ))?; + index_writer.commit()?; + // Empty segment special case + index_writer.add_document(doc!())?; + index_writer.commit()?; + // Full segment special case + index_writer.add_document(doc!( + text_field => "Hello Hello", + id_field => 1u64, + ))?; + index_writer.commit()?; + } + + let agg_req: Aggregations = serde_json::from_value(json!({ + "my_texts": { + "terms": { + "field": "text", + "missing": "Empty" + }, + }, + "my_texts2": { + "terms": { + "field": "text", + "missing": 1337 + }, + }, + "my_ids": { + "terms": { + "field": "id", + "missing": 1337 + }, + } + })) + .unwrap(); + + let res = exec_request_with_query(agg_req, &index, None)?; + + // text field + assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hello Hello"); + assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 5); + assert_eq!(res["my_texts"]["buckets"][1]["key"], "Empty"); + 
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 2); + assert_eq!( + res["my_texts"]["buckets"][2]["key"], + serde_json::Value::Null + ); + // text field with number as missing fallback + assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello"); + assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5); + assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0); + assert_eq!(res["my_texts2"]["buckets"][1]["doc_count"], 2); + assert_eq!( + res["my_texts2"]["buckets"][2]["key"], + serde_json::Value::Null + ); + assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); + assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0); + + // id field + assert_eq!(res["my_ids"]["buckets"][0]["key"], 1337.0); + assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 4); + assert_eq!(res["my_ids"]["buckets"][1]["key"], 1.0); + assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 3); + assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null); + + Ok(()) + } #[test] fn terms_aggregation_missing1() -> crate::Result<()> {