From 3cc038644b2b8d03274852562037ed8a9c6f227a Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Mon, 12 Aug 2024 15:31:50 +0200 Subject: [PATCH] Revert back to SentenceWindowRetrieval --- docs/pydoc/config/retrievers_api.yml | 2 +- haystack/components/retrievers/__init__.py | 4 +- ...riever.py => sentence_window_retrieval.py} | 14 ++--- ...nce-window-retrieval-8f5721772b28377e.yaml | 4 ++ ...r.py => test_sentence_window_retrieval.py} | 52 +++++++++---------- 5 files changed, 40 insertions(+), 36 deletions(-) rename haystack/components/retrievers/{sentence_window_retriever.py => sentence_window_retrieval.py} (96%) create mode 100644 releasenotes/notes/revert-sentence-window-retrieval-8f5721772b28377e.yaml rename test/components/retrievers/{test_sentence_window_retriever.py => test_sentence_window_retrieval.py} (78%) diff --git a/docs/pydoc/config/retrievers_api.yml b/docs/pydoc/config/retrievers_api.yml index c7f0e8ebd1..8b957cfde5 100644 --- a/docs/pydoc/config/retrievers_api.yml +++ b/docs/pydoc/config/retrievers_api.yml @@ -6,7 +6,7 @@ loaders: "in_memory/bm25_retriever", "in_memory/embedding_retriever", "filter_retriever", - "sentence_window_retriever", + "sentence_window_retrieval", ] ignore_when_discovered: ["__init__"] processors: diff --git a/haystack/components/retrievers/__init__.py b/haystack/components/retrievers/__init__.py index 91d1288a19..e86e40fbca 100644 --- a/haystack/components/retrievers/__init__.py +++ b/haystack/components/retrievers/__init__.py @@ -5,6 +5,6 @@ from haystack.components.retrievers.filter_retriever import FilterRetriever from haystack.components.retrievers.in_memory.bm25_retriever import InMemoryBM25Retriever from haystack.components.retrievers.in_memory.embedding_retriever import InMemoryEmbeddingRetriever -from haystack.components.retrievers.sentence_window_retriever import SentenceWindowRetriever +from haystack.components.retrievers.sentence_window_retrieval import SentenceWindowRetrieval -__all__ = ["FilterRetriever", 
"InMemoryEmbeddingRetriever", "InMemoryBM25Retriever", "SentenceWindowRetriever"] +__all__ = ["FilterRetriever", "InMemoryEmbeddingRetriever", "InMemoryBM25Retriever", "SentenceWindowRetrieval"] diff --git a/haystack/components/retrievers/sentence_window_retriever.py b/haystack/components/retrievers/sentence_window_retrieval.py similarity index 96% rename from haystack/components/retrievers/sentence_window_retriever.py rename to haystack/components/retrievers/sentence_window_retrieval.py index 34af44f650..9f39af40ed 100644 --- a/haystack/components/retrievers/sentence_window_retriever.py +++ b/haystack/components/retrievers/sentence_window_retrieval.py @@ -10,19 +10,19 @@ @component -class SentenceWindowRetriever: +class SentenceWindowRetrieval: """ Retrieves documents adjacent to a given document in the Document Store. During indexing, documents are broken into smaller chunks, or sentences. When you submit a query, the Retriever fetches the most relevant sentence. To provide full context, - SentenceWindowRetriever fetches a number of neighboring sentences before and after each + SentenceWindowRetrieval fetches a number of neighboring sentences before and after each relevant one. You can set this number with the `window_size` parameter. It uses `source_id` and `doc.meta['split_id']` to locate the surrounding documents. This component works with existing Retrievers, like BM25Retriever or EmbeddingRetriever. First, use a Retriever to find documents based on a query and then use - SentenceWindowRetriever to get the surrounding documents for context. + SentenceWindowRetrieval to get the surrounding documents for context. 
### Usage example @@ -30,7 +30,7 @@ class SentenceWindowRetriever: ```python from haystack import Document, Pipeline from haystack.components.retrievers.in_memory import InMemoryBM25Retriever - from haystack.components.retrievers import SentenceWindowRetriever + from haystack.components.retrievers import SentenceWindowRetrieval from haystack.components.preprocessors import DocumentSplitter from haystack.document_stores.in_memory import InMemoryDocumentStore @@ -47,7 +47,7 @@ class SentenceWindowRetriever: rag = Pipeline() rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1)) - rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2)) + rag.add_component("sentence_window_retriever", SentenceWindowRetrieval(document_store=doc_store, window_size=2)) rag.connect("bm25_retriever", "sentence_window_retriever") rag.run({'bm25_retriever': {"query":"third"}}) @@ -60,7 +60,7 @@ class SentenceWindowRetriever: def __init__(self, document_store: DocumentStore, window_size: int = 3): """ - Creates a new SentenceWindowRetriever component. + Creates a new SentenceWindowRetrieval component. :param document_store: The Document Store to retrieve the surrounding documents from. :param window_size: The number of documents to retrieve before and after the relevant one. @@ -110,7 +110,7 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict(self, document_store=docstore, window_size=self.window_size) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetriever": + def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetrieval": """ Deserializes the component from a dictionary. 
diff --git a/releasenotes/notes/revert-sentence-window-retrieval-8f5721772b28377e.yaml b/releasenotes/notes/revert-sentence-window-retrieval-8f5721772b28377e.yaml new file mode 100644 index 0000000000..5a74315972 --- /dev/null +++ b/releasenotes/notes/revert-sentence-window-retrieval-8f5721772b28377e.yaml @@ -0,0 +1,4 @@ +--- +upgrade: + - | + `SentenceWindowRetriever` component has been reverted back to `SentenceWindowRetrieval`. diff --git a/test/components/retrievers/test_sentence_window_retriever.py b/test/components/retrievers/test_sentence_window_retrieval.py similarity index 78% rename from test/components/retrievers/test_sentence_window_retriever.py rename to test/components/retrievers/test_sentence_window_retrieval.py index d1752c4b33..a200144432 100644 --- a/test/components/retrievers/test_sentence_window_retriever.py +++ b/test/components/retrievers/test_sentence_window_retrieval.py @@ -2,23 +2,23 @@ from haystack import Document, DeserializationError, Pipeline from haystack.components.retrievers import InMemoryBM25Retriever -from haystack.components.retrievers.sentence_window_retriever import SentenceWindowRetriever +from haystack.components.retrievers.sentence_window_retrieval import SentenceWindowRetrieval from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.components.preprocessors import DocumentSplitter -class TestSentenceWindowRetriever: +class TestSentenceWindowRetrieval: def test_init_default(self): - retriever = SentenceWindowRetriever(InMemoryDocumentStore()) - assert retriever.window_size == 3 + retrieval = SentenceWindowRetrieval(InMemoryDocumentStore()) + assert retrieval.window_size == 3 def test_init_with_parameters(self): - retriever = SentenceWindowRetriever(InMemoryDocumentStore(), window_size=5) - assert retriever.window_size == 5 + retrieval = SentenceWindowRetrieval(InMemoryDocumentStore(), window_size=5) + assert retrieval.window_size == 5 def test_init_with_invalid_window_size_parameter(self): with 
pytest.raises(ValueError): - SentenceWindowRetriever(InMemoryDocumentStore(), window_size=-2) + SentenceWindowRetrieval(InMemoryDocumentStore(), window_size=-2) def test_merge_documents(self): docs = [ @@ -50,15 +50,15 @@ def test_merge_documents(self): "_split_overlap": [{"doc_id": "doc_1", "range": (23, 52)}], }, ] - merged_text = SentenceWindowRetriever.merge_documents_text([Document.from_dict(doc) for doc in docs]) + merged_text = SentenceWindowRetrieval.merge_documents_text([Document.from_dict(doc) for doc in docs]) expected = "This is a text with some words. There is a second sentence. And there is also a third sentence" assert merged_text == expected def test_to_dict(self): - window_retriever = SentenceWindowRetriever(InMemoryDocumentStore()) - data = window_retriever.to_dict() + window_retrieval = SentenceWindowRetrieval(InMemoryDocumentStore()) + data = window_retrieval.to_dict() - assert data["type"] == "haystack.components.retrievers.sentence_window_retriever.SentenceWindowRetriever" + assert data["type"] == "haystack.components.retrievers.sentence_window_retrieval.SentenceWindowRetrieval" assert data["init_parameters"]["window_size"] == 3 assert ( data["init_parameters"]["document_store"]["type"] @@ -67,7 +67,7 @@ def test_to_dict(self): def test_from_dict(self): data = { - "type": "haystack.components.retrievers.sentence_window_retriever.SentenceWindowRetriever", + "type": "haystack.components.retrievers.sentence_window_retrieval.SentenceWindowRetrieval", "init_parameters": { "document_store": { "type": "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore", @@ -76,27 +76,27 @@ def test_from_dict(self): "window_size": 5, }, } - component = SentenceWindowRetriever.from_dict(data) + component = SentenceWindowRetrieval.from_dict(data) assert isinstance(component.document_store, InMemoryDocumentStore) assert component.window_size == 5 def test_from_dict_without_docstore(self): - data = {"type": "SentenceWindowRetriever", 
"init_parameters": {}} + data = {"type": "SentenceWindowRetrieval", "init_parameters": {}} with pytest.raises(DeserializationError, match="Missing 'document_store' in serialization data"): - SentenceWindowRetriever.from_dict(data) + SentenceWindowRetrieval.from_dict(data) def test_from_dict_without_docstore_type(self): - data = {"type": "SentenceWindowRetriever", "init_parameters": {"document_store": {"init_parameters": {}}}} + data = {"type": "SentenceWindowRetrieval", "init_parameters": {"document_store": {"init_parameters": {}}}} with pytest.raises(DeserializationError, match="Missing 'type' in document store's serialization data"): - SentenceWindowRetriever.from_dict(data) + SentenceWindowRetrieval.from_dict(data) def test_from_dict_non_existing_docstore(self): data = { - "type": "SentenceWindowRetriever", + "type": "SentenceWindowRetrieval", "init_parameters": {"document_store": {"type": "Nonexisting.Docstore", "init_parameters": {}}}, } with pytest.raises(DeserializationError): - SentenceWindowRetriever.from_dict(data) + SentenceWindowRetrieval.from_dict(data) def test_document_without_split_id(self): docs = [ @@ -104,8 +104,8 @@ def test_document_without_split_id(self): Document(content="some words. There is a second sentence. 
And there is ", meta={"id": "doc_1"}), ] with pytest.raises(ValueError): - retriever = SentenceWindowRetriever(document_store=InMemoryDocumentStore(), window_size=3) - retriever.run(retrieved_documents=docs) + retrieval = SentenceWindowRetrieval(document_store=InMemoryDocumentStore(), window_size=3) + retrieval.run(retrieved_documents=docs) def test_document_without_source_id(self): docs = [ @@ -115,8 +115,8 @@ def test_document_without_source_id(self): ), ] with pytest.raises(ValueError): - retriever = SentenceWindowRetriever(document_store=InMemoryDocumentStore(), window_size=3) - retriever.run(retrieved_documents=docs) + retrieval = SentenceWindowRetrieval(document_store=InMemoryDocumentStore(), window_size=3) + retrieval.run(retrieved_documents=docs) @pytest.mark.integration def test_run_with_pipeline(self): @@ -132,12 +132,12 @@ def test_run_with_pipeline(self): rag = Pipeline() rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1)) - rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2)) - rag.connect("bm25_retriever", "sentence_window_retriever") + rag.add_component("sentence_window_retrieval", SentenceWindowRetrieval(document_store=doc_store, window_size=2)) + rag.connect("bm25_retriever", "sentence_window_retrieval") result = rag.run({"bm25_retriever": {"query": "third"}}) expected = { - "sentence_window_retriever": { + "sentence_window_retrieval": { "context_windows": [ "some words. There is a second sentence. And there is also a third sentence. It also " "contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a "