Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upgrade: revert back to SentenceWindowRetrieval #8197

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/pydoc/config/retrievers_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ loaders:
"in_memory/bm25_retriever",
"in_memory/embedding_retriever",
"filter_retriever",
"sentence_window_retriever",
"sentence_window_retrieval",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
4 changes: 2 additions & 2 deletions haystack/components/retrievers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
from haystack.components.retrievers.filter_retriever import FilterRetriever
from haystack.components.retrievers.in_memory.bm25_retriever import InMemoryBM25Retriever
from haystack.components.retrievers.in_memory.embedding_retriever import InMemoryEmbeddingRetriever
from haystack.components.retrievers.sentence_window_retriever import SentenceWindowRetriever
from haystack.components.retrievers.sentence_window_retrieval import SentenceWindowRetrieval

__all__ = ["FilterRetriever", "InMemoryEmbeddingRetriever", "InMemoryBM25Retriever", "SentenceWindowRetriever"]
__all__ = ["FilterRetriever", "InMemoryEmbeddingRetriever", "InMemoryBM25Retriever", "SentenceWindowRetrieval"]
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,27 @@


@component
class SentenceWindowRetriever:
class SentenceWindowRetrieval:
"""
Retrieves documents adjacent to a given document in the Document Store.

During indexing, documents are broken into smaller chunks, or sentences. When you submit a query,
the Retriever fetches the most relevant sentence. To provide full context,
SentenceWindowRetriever fetches a number of neighboring sentences before and after each
SentenceWindowRetrieval fetches a number of neighboring sentences before and after each
relevant one. You can set this number with the `window_size` parameter.
It uses `source_id` and `doc.meta['split_id']` to locate the surrounding documents.

This component works with existing Retrievers, like BM25Retriever or
EmbeddingRetriever. First, use a Retriever to find documents based on a query and then use
SentenceWindowRetriever to get the surrounding documents for context.
SentenceWindowRetrieval to get the surrounding documents for context.


### Usage example

```python
from haystack import Document, Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.retrievers import SentenceWindowRetriever
from haystack.components.retrievers import SentenceWindowRetrieval
from haystack.components.preprocessors import DocumentSplitter
from haystack.document_stores.in_memory import InMemoryDocumentStore

Expand All @@ -47,7 +47,7 @@ class SentenceWindowRetriever:

rag = Pipeline()
rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1))
rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2))
rag.add_component("sentence_window_retriever", SentenceWindowRetrieval(document_store=doc_store, window_size=2))
rag.connect("bm25_retriever", "sentence_window_retriever")

rag.run({'bm25_retriever': {"query":"third"}})
Expand All @@ -60,7 +60,7 @@ class SentenceWindowRetriever:

def __init__(self, document_store: DocumentStore, window_size: int = 3):
"""
Creates a new SentenceWindowRetriever component.
Creates a new SentenceWindowRetrieval component.

:param document_store: The Document Store to retrieve the surrounding documents from.
:param window_size: The number of documents to retrieve before and after the relevant one.
Expand Down Expand Up @@ -110,7 +110,7 @@ def to_dict(self) -> Dict[str, Any]:
return default_to_dict(self, document_store=docstore, window_size=self.window_size)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetriever":
def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetrieval":
"""
Deserializes the component from a dictionary.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
upgrade:
- |
The `SentenceWindowRetriever` component has been renamed back to `SentenceWindowRetrieval`.
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@

from haystack import Document, DeserializationError, Pipeline
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.retrievers.sentence_window_retriever import SentenceWindowRetriever
from haystack.components.retrievers.sentence_window_retrieval import SentenceWindowRetrieval
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.preprocessors import DocumentSplitter


class TestSentenceWindowRetriever:
class TestSentenceWindowRetrieval:
def test_init_default(self):
retriever = SentenceWindowRetriever(InMemoryDocumentStore())
assert retriever.window_size == 3
retrieval = SentenceWindowRetrieval(InMemoryDocumentStore())
assert retrieval.window_size == 3

def test_init_with_parameters(self):
retriever = SentenceWindowRetriever(InMemoryDocumentStore(), window_size=5)
assert retriever.window_size == 5
retrieval = SentenceWindowRetrieval(InMemoryDocumentStore(), window_size=5)
assert retrieval.window_size == 5

def test_init_with_invalid_window_size_parameter(self):
with pytest.raises(ValueError):
SentenceWindowRetriever(InMemoryDocumentStore(), window_size=-2)
SentenceWindowRetrieval(InMemoryDocumentStore(), window_size=-2)

def test_merge_documents(self):
docs = [
Expand Down Expand Up @@ -50,15 +50,15 @@ def test_merge_documents(self):
"_split_overlap": [{"doc_id": "doc_1", "range": (23, 52)}],
},
]
merged_text = SentenceWindowRetriever.merge_documents_text([Document.from_dict(doc) for doc in docs])
merged_text = SentenceWindowRetrieval.merge_documents_text([Document.from_dict(doc) for doc in docs])
expected = "This is a text with some words. There is a second sentence. And there is also a third sentence"
assert merged_text == expected

def test_to_dict(self):
window_retriever = SentenceWindowRetriever(InMemoryDocumentStore())
data = window_retriever.to_dict()
window_retrieval = SentenceWindowRetrieval(InMemoryDocumentStore())
data = window_retrieval.to_dict()

assert data["type"] == "haystack.components.retrievers.sentence_window_retriever.SentenceWindowRetriever"
assert data["type"] == "haystack.components.retrievers.sentence_window_retrieval.SentenceWindowRetrieval"
assert data["init_parameters"]["window_size"] == 3
assert (
data["init_parameters"]["document_store"]["type"]
Expand All @@ -67,7 +67,7 @@ def test_to_dict(self):

def test_from_dict(self):
data = {
"type": "haystack.components.retrievers.sentence_window_retriever.SentenceWindowRetriever",
"type": "haystack.components.retrievers.sentence_window_retrieval.SentenceWindowRetrieval",
"init_parameters": {
"document_store": {
"type": "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore",
Expand All @@ -76,36 +76,36 @@ def test_from_dict(self):
"window_size": 5,
},
}
component = SentenceWindowRetriever.from_dict(data)
component = SentenceWindowRetrieval.from_dict(data)
assert isinstance(component.document_store, InMemoryDocumentStore)
assert component.window_size == 5

def test_from_dict_without_docstore(self):
data = {"type": "SentenceWindowRetriever", "init_parameters": {}}
data = {"type": "SentenceWindowRetrieval", "init_parameters": {}}
with pytest.raises(DeserializationError, match="Missing 'document_store' in serialization data"):
SentenceWindowRetriever.from_dict(data)
SentenceWindowRetrieval.from_dict(data)

def test_from_dict_without_docstore_type(self):
data = {"type": "SentenceWindowRetriever", "init_parameters": {"document_store": {"init_parameters": {}}}}
data = {"type": "SentenceWindowRetrieval", "init_parameters": {"document_store": {"init_parameters": {}}}}
with pytest.raises(DeserializationError, match="Missing 'type' in document store's serialization data"):
SentenceWindowRetriever.from_dict(data)
SentenceWindowRetrieval.from_dict(data)

def test_from_dict_non_existing_docstore(self):
data = {
"type": "SentenceWindowRetriever",
"type": "SentenceWindowRetrieval",
"init_parameters": {"document_store": {"type": "Nonexisting.Docstore", "init_parameters": {}}},
}
with pytest.raises(DeserializationError):
SentenceWindowRetriever.from_dict(data)
SentenceWindowRetrieval.from_dict(data)

def test_document_without_split_id(self):
docs = [
Document(content="This is a text with some words. There is a ", meta={"id": "doc_0"}),
Document(content="some words. There is a second sentence. And there is ", meta={"id": "doc_1"}),
]
with pytest.raises(ValueError):
retriever = SentenceWindowRetriever(document_store=InMemoryDocumentStore(), window_size=3)
retriever.run(retrieved_documents=docs)
retrieval = SentenceWindowRetrieval(document_store=InMemoryDocumentStore(), window_size=3)
retrieval.run(retrieved_documents=docs)

def test_document_without_source_id(self):
docs = [
Expand All @@ -115,8 +115,8 @@ def test_document_without_source_id(self):
),
]
with pytest.raises(ValueError):
retriever = SentenceWindowRetriever(document_store=InMemoryDocumentStore(), window_size=3)
retriever.run(retrieved_documents=docs)
retrieval = SentenceWindowRetrieval(document_store=InMemoryDocumentStore(), window_size=3)
retrieval.run(retrieved_documents=docs)

@pytest.mark.integration
def test_run_with_pipeline(self):
Expand All @@ -132,12 +132,12 @@ def test_run_with_pipeline(self):

rag = Pipeline()
rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1))
rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2))
rag.connect("bm25_retriever", "sentence_window_retriever")
rag.add_component("sentence_window_retrieval", SentenceWindowRetrieval(document_store=doc_store, window_size=2))
rag.connect("bm25_retriever", "sentence_window_retrieval")
result = rag.run({"bm25_retriever": {"query": "third"}})

expected = {
"sentence_window_retriever": {
"sentence_window_retrieval": {
"context_windows": [
"some words. There is a second sentence. And there is also a third sentence. It also "
"contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a "
Expand Down
Loading