Skip to content

Commit

Permalink
feat: Increase logging transparency for empty Documents during conver…
Browse files Browse the repository at this point in the history
…sion (#8509)

* Add log lines for PDF conversion and make skipping more explicit in DocumentSplitter

* Add logging statement for PDFMinerToDocument as well

* Add tests

* Remove unused line

* Remove unused line

* add reno

* Add in PDF file

* Update checks in PDF converters and add tests for document splitter

* Revert

* Remove line

* Fix comment

* Make mypy happy

* Make mypy happy
  • Loading branch information
sjrl authored Nov 4, 2024
1 parent 2595e68 commit 911f352
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 9 deletions.
12 changes: 9 additions & 3 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class PDFMinerToDocument:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
line_overlap: float = 0.5,
char_margin: float = 2.0,
Expand Down Expand Up @@ -92,7 +92,7 @@ def __init__(
all_texts=all_texts,
)

def __converter(self, extractor) -> Document:
def _converter(self, extractor) -> Document:
"""
Extracts text from PDF pages then convert the text into Documents
Expand Down Expand Up @@ -151,13 +151,19 @@ def run(
continue
try:
pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
document = self.__converter(pdf_reader)
document = self._converter(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

if document.content is None or document.content.strip() == "":
logger.warning(
"PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
)

merged_metadata = {**bytestream.meta, **metadata}
document.meta = merged_metadata
documents.append(document)
Expand Down
10 changes: 8 additions & 2 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ def from_dict(cls, data):
:returns:
Deserialized component.
"""
custom_converter_data = data["init_parameters"]["converter"]
init_parameters = data.get("init_parameters", {})
custom_converter_data = init_parameters.get("converter")
if custom_converter_data is not None:
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)

return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
Expand Down Expand Up @@ -142,6 +142,12 @@ def run(
)
continue

if document.content is None or document.content.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
)

merged_metadata = {**bytestream.meta, **metadata}
document.meta = merged_metadata
documents.append(document)
Expand Down
10 changes: 8 additions & 2 deletions haystack/components/preprocessors/document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@

from more_itertools import windowed

from haystack import Document, component
from haystack import Document, component, logging
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable

logger = logging.getLogger(__name__)


@component
class DocumentSplitter:
Expand Down Expand Up @@ -46,7 +48,7 @@ class DocumentSplitter:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
split_length: int = 200,
Expand Down Expand Up @@ -112,6 +114,9 @@ def run(self, documents: List[Document]):
raise ValueError(
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
)
if doc.content == "":
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
continue
units = self._split_into_units(doc.content, self.split_by)
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
units, self.split_length, self.split_overlap, self.split_threshold
Expand Down Expand Up @@ -173,6 +178,7 @@ def _concatenate_units(
# concatenate the last split with the current one
text_splits[-1] += txt

# NOTE: This line skips documents that have content=""
elif len(txt) > 0:
text_splits.append(txt)
splits_pages.append(cur_page)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
features:
- |
Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content.
This can happen if the PDF file is a scanned image.
Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped.
This behavior was already occurring, but now its clearer through logs that this is happening.
8 changes: 8 additions & 0 deletions test/components/converters/test_pdfminer_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,11 @@ def test_run_error_handling(self, caplog):
results = converter.run(sources=sources)
assert "Could not read non_existing_file.pdf" in caplog.text
assert results["documents"] == []

def test_run_empty_document(self, caplog, test_files_path):
sources = [test_files_path / "pdf" / "non_text_searchable.pdf"]
converter = PDFMinerToDocument()
with caplog.at_level(logging.WARNING):
results = converter.run(sources=sources)
assert "PDFMinerToDocument could not extract text from the file" in caplog.text
assert results["documents"][0].content == ""
15 changes: 13 additions & 2 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,13 @@ def test_from_dict(self):
assert isinstance(instance, PyPDFToDocument)
assert instance.converter is None

def test_from_dict_defaults(self):
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert instance.converter is None

def test_from_dict_custom_converter(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
data = pypdf_converter.to_dict()
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
Expand Down Expand Up @@ -161,3 +165,10 @@ def from_dict(cls, data):
assert len(docs) == 1
assert "ReAct" not in docs[0].content
assert "I don't care about converting given pdfs, I always return this" in docs[0].content

def test_run_empty_document(self, caplog, test_files_path):
paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
with caplog.at_level(logging.WARNING):
output = PyPDFToDocument().run(sources=paths)
assert "PyPDFToDocument could not extract text from the file" in caplog.text
assert output["documents"][0].content == ""
19 changes: 19 additions & 0 deletions test/components/preprocessors/test_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def test_non_text_document(self):
):
splitter = DocumentSplitter()
splitter.run(documents=[Document()])
assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

def test_single_doc(self):
with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
Expand Down Expand Up @@ -445,3 +446,21 @@ def test_roundtrip_serialization_with_splitting_function(self):
assert original_splitter.split_by == deserialized_splitter.split_by
assert callable(deserialized_splitter.splitting_function)
assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]

def test_run_empty_document(self):
"""
Test if the component runs correctly with an empty document.
"""
splitter = DocumentSplitter()
doc = Document(content="")
results = splitter.run([doc])
assert results["documents"] == []

def test_run_document_only_whitespaces(self):
"""
Test if the component runs correctly with a document containing only whitespaces.
"""
splitter = DocumentSplitter()
doc = Document(content=" ")
results = splitter.run([doc])
assert results["documents"][0].content == " "
Binary file added test/test_files/pdf/non_text_searchable.pdf
Binary file not shown.

0 comments on commit 911f352

Please sign in to comment.