draft: Nested document query #13

Open

wants to merge 34 commits into base: dev

Commits (34)
29bf48d
nested_document_query
neilyio Nov 26, 2024
b9a412c
nested_document_query impl
neilyio Nov 26, 2024
8d80e4a
add add_documents method
neilyio Dec 3, 2024
44edcd3
test passes
neilyio Dec 4, 2024
1bb653c
test pass stable
neilyio Dec 11, 2024
a57c3d6
feat: Add block_join_query module to src/query directory
neilyio Dec 12, 2024
c5fd4d5
Based on the extensive println debugging added, I'll generate a conci…
neilyio Dec 12, 2024
082dcde
refactor: Modernize block join query test code with add_documents and…
neilyio Dec 12, 2024
dd67221
fix: Resolve BlockJoinQuery test failures and implement explain funct…
neilyio Dec 12, 2024
cc5da00
fix: Update BlockJoinQuery explain method with minor syntax changes
neilyio Dec 12, 2024
e706616
The changes look good. Let me generate a concise commit message for t…
neilyio Dec 12, 2024
b5b9ffd
refactor: Improve BlockJoinQuery explain method with document-specifi…
neilyio Dec 12, 2024
871aab7
fix: Refactor BlockJoinScorer to correctly handle document matching a…
neilyio Dec 12, 2024
8402703
fix: Correct BlockJoinQuery parameter order and child doc matching logic
neilyio Dec 13, 2024
0571075
fix: Refactor BlockJoinScorer to correctly handle child document scor…
neilyio Dec 13, 2024
d1513b4
refactor: Update parent query from "parent" to "resume" in tests
neilyio Dec 13, 2024
8a300eb
fix: Adjust BlockJoinScorer initialization to resolve failing tests
neilyio Dec 13, 2024
6a93e29
fix: Track previous parent in BlockJoinScorer to collect child docume…
neilyio Dec 13, 2024
6bf4254
fix: Use u32::MAX as sentinel value for previous_parent in BlockJoinS…
neilyio Dec 13, 2024
541b8b8
fix: Ensure child_scorer is advanced before collecting child documents
neilyio Dec 13, 2024
060e5a8
fix: Correct handling of previous_parent to avoid overflow in tests
neilyio Dec 13, 2024
9949add
fix: Change previous_parent type to Option<DocId> in BlockJoinScorer …
neilyio Dec 13, 2024
b7c433f
fix: Correct BlockJoinScorer initialization and doc() method for bloc…
neilyio Dec 13, 2024
f278893
fix: Correct BlockJoinScorer initialization and document advancement …
neilyio Dec 13, 2024
a8707c1
fix: Correct BlockJoinScorer document advancement and score collectio…
neilyio Dec 13, 2024
573948a
fix: Correctly track parent documents in BlockJoinScorer methods
neilyio Dec 13, 2024
9978e58
fix: Correct parent update order in BlockJoinScorer methods
neilyio Dec 13, 2024
51c1c6f
fix: Correct order of parent updates in block_join_query.rs
neilyio Dec 13, 2024
c44ca1d
fix: Correct parent document handling in BlockJoinScorer advance method
neilyio Dec 13, 2024
d85175e
refactor: Update return types to use crate::Result and add for_each_p…
neilyio Dec 13, 2024
9ad8a77
fix: Update test assertion to match indexed resume content
neilyio Dec 13, 2024
3ff825a
fix: Correct test data content in block join query test
neilyio Dec 13, 2024
a5fbd7c
block join collector
neilyio Dec 13, 2024
c7e58df
block join tests
neilyio Dec 13, 2024
113 changes: 113 additions & 0 deletions src/block_join_collector.rs
@@ -0,0 +1,113 @@
use crate::collector::Collector;
use crate::query::Scorer;
use crate::DocId;
use crate::Result;
use crate::Score;
use crate::SegmentReader;
use common::BitSet;

/// A conceptual `BlockJoinCollector` that aims to mimic Lucene's BlockJoinCollector.
/// It collects parent documents and, for each one, stores which child docs matched.
/// After search, you can retrieve these "groups".
///
/// NOTE: This is a conceptual implementation. Adjust as per Tantivy's Collector API.
/// In Tantivy, you'd typically implement `Collector` and `SegmentCollector`.
pub struct BlockJoinCollector {
// For simplicity, store doc groups in memory:
groups: Vec<(DocId, Vec<DocId>, Vec<Score>)>,
current_reader_base: DocId,
}

impl BlockJoinCollector {
pub fn new() -> BlockJoinCollector {
BlockJoinCollector {
groups: Vec::new(),
current_reader_base: 0,
}
}

/// Retrieve the collected groups:
pub fn get_groups(&self) -> &[(DocId, Vec<DocId>, Vec<Score>)] {
&self.groups
}
}

impl Collector for BlockJoinCollector {
type Fruit = ();

fn set_segment(
&mut self,
_segment_id: u32,
reader: &SegmentReader,
) -> Result<Box<dyn crate::collector::SegmentCollector<Fruit = ()>>> {
let base = self.current_reader_base;
self.current_reader_base += reader.max_doc();
let parent_bitset = BitSet::with_max_value(reader.max_doc());
// In a real scenario, you'd identify the parent docs here using a filter.
// For this conceptual example, we assume parents are known externally.
// You might need to pass that information in or have a filter pre-applied.

Ok(Box::new(BlockJoinSegmentCollector {
parent_bitset,
parent_groups: &mut self.groups,
base,
}))
}

fn requires_scoring(&self) -> bool {
true
}

fn collect(&mut self, _doc: DocId, _score: Score) -> Result<()> {
// This method won't be called directly if we rely on segment collectors.
Ok(())
}

fn harvest(self) -> Result<Self::Fruit> {
Ok(())
}
}

struct BlockJoinSegmentCollector<'a> {
parent_bitset: BitSet,
parent_groups: &'a mut Vec<(DocId, Vec<DocId>, Vec<Score>)>,
base: DocId,
}

impl<'a> crate::collector::SegmentCollector for BlockJoinSegmentCollector<'a> {
type Fruit = ();

fn collect(&mut self, doc: DocId, score: Score) {
// In a more complete implementation, you'd need
// logic to detect transitions from child docs to parent doc.
//
// This is a simplified conceptual collector. In practice:
// 1. Identify if `doc` is a parent or child.
// 2. If child, associate with last-seen parent.
// 3. If parent, start a new group.

// Without full integration this is hard to do here. For now,
// assume that scoring and doc iteration are handled by
// BlockJoinScorer and that we only collect parents when
// we encounter them:
if self.parent_bitset.contains(doc) {
// It's a parent doc
self.parent_groups
.push((self.base + doc, Vec::new(), Vec::new()));
} else {
// It's a child doc - associate it with last parent
if let Some(last) = self.parent_groups.last_mut() {
last.1.push(self.base + doc);
last.2.push(score);
}
}
}

fn set_scorer(&mut self, _scorer: Box<dyn Scorer>) {
// Not implemented - you'd store the scorer if needed.
}

fn harvest(self) -> Result<Self::Fruit> {
Ok(())
}
}
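The grouping step in `collect` above can be sketched in isolation. The following is a hypothetical, self-contained version with no tantivy types: `group_blocks` and `is_parent` are illustrative names, with `is_parent` playing the role of `parent_bitset`.

```rust
// Standalone sketch of the grouping logic in `collect` above.
// A parent doc opens a new group; each subsequent child doc is
// attached to the last-seen parent.
fn group_blocks(is_parent: &[bool]) -> Vec<(usize, Vec<usize>)> {
    let mut groups: Vec<(usize, Vec<usize>)> = Vec::new();
    for (doc, &parent) in is_parent.iter().enumerate() {
        if parent {
            // It's a parent doc: start a new group.
            groups.push((doc, Vec::new()));
        } else if let Some(last) = groups.last_mut() {
            // It's a child doc: associate it with the last parent.
            last.1.push(doc);
        }
    }
    groups
}

fn main() {
    // Parent at doc 0 with children 1-2, parent at doc 3 with child 4.
    let groups = group_blocks(&[true, false, false, true, false]);
    assert_eq!(groups, vec![(0, vec![1, 2]), (3, vec![4])]);
}
```

Note that, as with the conceptual collector, children appearing before any parent are silently dropped.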
205 changes: 205 additions & 0 deletions src/indexer/index_writer.rs
@@ -714,6 +714,28 @@ impl<D: Document> IndexWriter<D> {
Ok(opstamp)
}

/// Adds multiple documents as a block.
///
/// This method allows adding multiple documents together as a single block.
/// This is important for nested documents, where child documents need to be
/// added before their parent document, and they need to be stored together
/// in the same block.
///
/// The opstamp returned is the opstamp of the last document added.
pub fn add_documents(&self, documents: Vec<D>) -> crate::Result<Opstamp> {
let count = documents.len() as u64;
if count == 0 {
return Ok(self.stamper.stamp());
}
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
let mut adds = AddBatch::default();
for (document, opstamp) in documents.into_iter().zip(stamps) {
adds.push(AddOperation { opstamp, document });
}
self.send_add_documents_batch(adds)?;
Ok(batch_opstamp)
}
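The batching above hands one opstamp to each document and returns the last one. A self-contained sketch of that contract, using an illustrative `Stamper` that mirrors (but is not identical to) tantivy's internal stamper:

```rust
// Hypothetical sketch of the opstamp batching used by `add_documents`:
// a monotonically increasing stamper hands out one opstamp per document,
// and the batch opstamp is the last stamp in the range.
struct Stamper(u64);

impl Stamper {
    /// Hands out the next single opstamp.
    fn stamp(&mut self) -> u64 {
        self.0 += 1;
        self.0
    }

    /// Mirrors `get_batch_opstamps`: reserves `count` stamps and
    /// returns (last opstamp, full range of reserved stamps).
    fn stamps(&mut self, count: u64) -> (u64, std::ops::Range<u64>) {
        let start = self.0 + 1;
        self.0 += count;
        (self.0, start..self.0 + 1)
    }
}

fn main() {
    let mut stamper = Stamper(0);
    let (batch_opstamp, range) = stamper.stamps(3);
    assert_eq!(batch_opstamp, 3);
    assert_eq!(range.collect::<Vec<_>>(), vec![1, 2, 3]);
    // A lone document added afterwards gets the next stamp.
    assert_eq!(stamper.stamp(), 4);
}
```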

/// Gets a range of stamps from the stamper and "pops" the last stamp
/// from the range, returning a tuple of the last opstamp and the popped
/// range.
@@ -820,6 +842,7 @@ mod tests {
STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::Result;
use crate::{
DateTime, DocAddress, Index, IndexSettings, IndexWriter, ReloadPolicy, TantivyDocument,
Term,
@@ -1234,6 +1257,188 @@
Ok(())
}

#[test]
fn test_add_documents() -> Result<()> {
// Create a simple schema with one text field
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();

// Create an index in RAM
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;

// Create multiple documents
let docs = vec![
doc!(text_field => "hello"),
doc!(text_field => "world"),
doc!(text_field => "tantivy"),
];

// Add documents using add_documents
let opstamp = index_writer.add_documents(docs)?;
assert_eq!(opstamp, 3u64); // Since we have three documents, opstamp should be 3

// Commit the changes
index_writer.commit()?;

// Create a reader and searcher
let reader = index.reader()?;
reader.reload()?;
let searcher = reader.searcher();

// Verify that the documents are indexed correctly
let term = Term::from_field_text(text_field, "hello");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
assert_eq!(top_docs.len(), 1);

let term = Term::from_field_text(text_field, "world");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
assert_eq!(top_docs.len(), 1);

let term = Term::from_field_text(text_field, "tantivy");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
assert_eq!(top_docs.len(), 1);

Ok(())
}

#[test]
fn test_add_documents_empty() -> Result<()> {
// Test adding an empty list of documents
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;

let docs: Vec<TantivyDocument> = Vec::new();
let opstamp = index_writer.add_documents(docs)?;
assert_eq!(opstamp, 0u64);

// Since no documents were added, committing should not change anything
index_writer.commit()?;

let reader = index.reader()?;
reader.reload()?;
let searcher = reader.searcher();

// Search for any documents, expecting none
let term = Term::from_field_text(text_field, "any");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
assert_eq!(top_docs.len(), 0);

Ok(())
}

#[test]
fn test_add_documents_order() -> Result<()> {
// Test that documents are indexed in the order they are added
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT | STORED);
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;

// Create multiple documents
let docs = vec![
doc!(text_field => "doc1"),
doc!(text_field => "doc2"),
doc!(text_field => "doc3"),
];

// Add documents using add_documents
index_writer.add_documents(docs)?;
index_writer.commit()?;

// Create a reader and searcher
let reader = index.reader()?;
reader.reload()?;
let searcher = reader.searcher();

// Collect documents and verify their order
let all_docs = searcher
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let store_reader = segment_reader.get_store_reader(1000).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc_id| store_reader.get::<TantivyDocument>(doc_id).unwrap())
})
.collect::<Vec<_>>();

assert_eq!(all_docs.len(), 3);
assert_eq!(
all_docs[0].get_first(text_field).unwrap().as_str(),
Some("doc1")
);
assert_eq!(
all_docs[1].get_first(text_field).unwrap().as_str(),
Some("doc2")
);
assert_eq!(
all_docs[2].get_first(text_field).unwrap().as_str(),
Some("doc3")
);

Ok(())
}

#[test]
fn test_add_documents_concurrency() -> Result<()> {
// Test adding documents concurrently
use std::sync::mpsc;
use std::thread;

let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;

// Create a channel to send documents to the indexer
let (doc_sender, doc_receiver) = mpsc::channel();

// Spawn a thread to add documents
let sender_clone = doc_sender.clone();
let handle = thread::spawn(move || {
let docs = vec![doc!(text_field => "threaded")];
for doc in docs {
sender_clone.send(doc).unwrap();
}
});

// Drop the extra sender to close the channel when done
drop(doc_sender);

// Indexer thread
for doc in doc_receiver {
index_writer.add_document(doc)?;
}

index_writer.commit()?;
handle.join().unwrap();

let reader = index.reader()?;
reader.reload()?;
let searcher = reader.searcher();

let term = Term::from_field_text(text_field, "threaded");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
assert_eq!(top_docs.len(), 1);

Ok(())
}

#[test]
fn test_add_then_delete_all_documents() {
let mut schema_builder = schema::Schema::builder();
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,3 +1,4 @@
#![allow(warnings)]
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]