feat: Increase logging transparency for empty Documents during conver…

…sion (#8509) * Add log lines for PDF conversion and make skipping more explicit in DocumentSplitter * Add logging statement for PDFMinerToDocument as well * Add tests * Remove unused line * Remove unused line * add reno * Add in PDF file * Update checks in PDF converters and add tests for document splitter * Revert * Remove line * Fix comment * Make mypy happy * Make mypy happy
deepset-ai · Nov 4, 2024 · 911f352 · 911f352
1 parent 2595e68
commit 911f352
Show file tree

Hide file tree

Showing 8 changed files with 72 additions and 9 deletions.
diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py
@@ -37,7 +37,7 @@ class PDFMinerToDocument:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         line_overlap: float = 0.5,
         char_margin: float = 2.0,
@@ -92,7 +92,7 @@ def __init__(
             all_texts=all_texts,
         )
 
-    def __converter(self, extractor) -> Document:
+    def _converter(self, extractor) -> Document:
         """
         Extracts text from PDF pages then convert the text into Documents
 
@@ -151,13 +151,19 @@ def run(
                 continue
             try:
                 pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
-                document = self.__converter(pdf_reader)
+                document = self._converter(pdf_reader)
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                 )
                 continue
 
+            if document.content is None or document.content.strip() == "":
+                logger.warning(
+                    "PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
+                    source=source,
+                )
+
             merged_metadata = {**bytestream.meta, **metadata}
             document.meta = merged_metadata
             documents.append(document)

diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -90,10 +90,10 @@ def from_dict(cls, data):
         :returns:
             Deserialized component.
         """
-        custom_converter_data = data["init_parameters"]["converter"]
+        init_parameters = data.get("init_parameters", {})
+        custom_converter_data = init_parameters.get("converter")
         if custom_converter_data is not None:
             data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
-
         return default_from_dict(cls, data)
 
     def _default_convert(self, reader: "PdfReader") -> Document:
@@ -142,6 +142,12 @@ def run(
                 )
                 continue
 
+            if document.content is None or document.content.strip() == "":
+                logger.warning(
+                    "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
+                    source=source,
+                )
+
             merged_metadata = {**bytestream.meta, **metadata}
             document.meta = merged_metadata
             documents.append(document)

diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py
@@ -7,10 +7,12 @@
 
 from more_itertools import windowed
 
-from haystack import Document, component
+from haystack import Document, component, logging
 from haystack.core.serialization import default_from_dict, default_to_dict
 from haystack.utils import deserialize_callable, serialize_callable
 
+logger = logging.getLogger(__name__)
+
 
 @component
 class DocumentSplitter:
@@ -46,7 +48,7 @@ class DocumentSplitter:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
         split_length: int = 200,
@@ -112,6 +114,9 @@ def run(self, documents: List[Document]):
                 raise ValueError(
                     f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                 )
+            if doc.content == "":
+                logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
+                continue
             units = self._split_into_units(doc.content, self.split_by)
             text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
                 units, self.split_length, self.split_overlap, self.split_threshold
@@ -173,6 +178,7 @@ def _concatenate_units(
                 # concatenate the last split with the current one
                 text_splits[-1] += txt
 
+            # NOTE: This line skips documents that have content=""
             elif len(txt) > 0:
                 text_splits.append(txt)
                 splits_pages.append(cur_page)

diff --git a/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml b/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content.
+    This can happen if the PDF file is a scanned image.
+    Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped.
+    This behavior was already occurring, but now its clearer through logs that this is happening.
diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py
@@ -126,3 +126,11 @@ def test_run_error_handling(self, caplog):
             results = converter.run(sources=sources)
             assert "Could not read non_existing_file.pdf" in caplog.text
             assert results["documents"] == []
+
+    def test_run_empty_document(self, caplog, test_files_path):
+        sources = [test_files_path / "pdf" / "non_text_searchable.pdf"]
+        converter = PDFMinerToDocument()
+        with caplog.at_level(logging.WARNING):
+            results = converter.run(sources=sources)
+            assert "PDFMinerToDocument could not extract text from the file" in caplog.text
+            assert results["documents"][0].content == ""
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
@@ -63,9 +63,13 @@ def test_from_dict(self):
         assert isinstance(instance, PyPDFToDocument)
         assert instance.converter is None
 
+    def test_from_dict_defaults(self):
+        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
+        instance = PyPDFToDocument.from_dict(data)
+        assert isinstance(instance, PyPDFToDocument)
+        assert instance.converter is None
+
     def test_from_dict_custom_converter(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        data = pypdf_converter.to_dict()
         data = {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
             "init_parameters": {
@@ -161,3 +165,10 @@ def from_dict(cls, data):
         assert len(docs) == 1
         assert "ReAct" not in docs[0].content
         assert "I don't care about converting given pdfs, I always return this" in docs[0].content
+
+    def test_run_empty_document(self, caplog, test_files_path):
+        paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
+        with caplog.at_level(logging.WARNING):
+            output = PyPDFToDocument().run(sources=paths)
+            assert "PyPDFToDocument could not extract text from the file" in caplog.text
+            assert output["documents"][0].content == ""
diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py
@@ -43,6 +43,7 @@ def test_non_text_document(self):
         ):
             splitter = DocumentSplitter()
             splitter.run(documents=[Document()])
+            assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text
 
     def test_single_doc(self):
         with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
@@ -445,3 +446,21 @@ def test_roundtrip_serialization_with_splitting_function(self):
         assert original_splitter.split_by == deserialized_splitter.split_by
         assert callable(deserialized_splitter.splitting_function)
         assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
+
+    def test_run_empty_document(self):
+        """
+        Test if the component runs correctly with an empty document.
+        """
+        splitter = DocumentSplitter()
+        doc = Document(content="")
+        results = splitter.run([doc])
+        assert results["documents"] == []
+
+    def test_run_document_only_whitespaces(self):
+        """
+        Test if the component runs correctly with a document containing only whitespaces.
+        """
+        splitter = DocumentSplitter()
+        doc = Document(content="  ")
+        results = splitter.run([doc])
+        assert results["documents"][0].content == "  "
diff --git a/test/test_files/pdf/non_text_searchable.pdf b/test/test_files/pdf/non_text_searchable.pdf