deepset-ai · anakin87 · Jul 26, 2024 · Jul 25, 2024 · Jul 25, 2024 · Jul 26, 2024
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+from html.parser import HTMLParser
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -17,6 +18,36 @@
 logger = logging.getLogger(__name__)
 
 
class XHTMLParser(HTMLParser):
    """
    Custom parser to extract pages from Tika XHTML content.

    Text is accumulated in `page` while `ingest` is set. Every closing `div` or `body` tag seen while
    ingesting finishes the current page: the accumulated text is appended to `pages` and ingestion
    stops until the next `div` whose `class` attribute contains the `page` token.

    Because ingestion starts enabled, text that precedes the first page div (or the text of a
    document that has no page divs at all) is collected as well.
    """

    def __init__(self) -> None:
        super().__init__()
        # whether incoming text currently belongs to a page being collected
        self.ingest = True
        # text collected so far for the page in progress
        self.page = ""
        # finished pages, in document order
        self.pages: List[str] = []

    def handle_starttag(self, tag: str, attrs: List[tuple]) -> None:
        """
        Identify the start of a page div.

        :param tag: Lower-cased tag name as reported by `HTMLParser`.
        :param attrs: `(name, value)` pairs of the tag; `value` is `None` for attributes without a value.
        """
        if tag != "div":
            return
        for name, value in attrs:
            # `class` holds a space-separated list of tokens, so match the token rather than the
            # whole attribute value; a plain `class="page"` is still matched.
            if name == "class" and value is not None and "page" in value.split():
                self.ingest = True
                return

    def handle_endtag(self, tag: str) -> None:
        """
        Identify the end of a page div and store the collected page.

        :param tag: Lower-cased tag name as reported by `HTMLParser`.
        """
        if not self.ingest or tag not in ("div", "body"):
            return
        self.ingest = False
        # restore words hyphened to the next line
        self.pages.append(self.page.replace("-\n", ""))
        self.page = ""

    def handle_data(self, data: str) -> None:
        """
        Populate the page content.

        :param data: Text found between tags; ignored while no page is being ingested.
        """
        if self.ingest:
            self.page += data
+
+
 @component
 class TikaDocumentConverter:
  """
@@ -85,7 +116,14 @@ def run(
  logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
  continue
  try:
- text = tika_parser.from_buffer(io.BytesIO(bytestream.data), serverEndpoint=self.tika_url)["content"]
+ # we extract the content as XHTML to preserve the structure of the document as much as possible;
+ # page boundaries can be recovered this way for PDFs, but not for other file types (e.g. DOCX)
+ xhtml_content = tika_parser.from_buffer(
+ io.BytesIO(bytestream.data), serverEndpoint=self.tika_url, xmlContent=True
+ )["content"]
+ xhtml_parser = XHTMLParser()
+ xhtml_parser.feed(xhtml_content)
+ text = "\f".join(xhtml_parser.pages)
  except Exception as conversion_e:
  logger.warning(
  "Failed to extract text from {source}. Skipping it. Error: {error}",

@@ -0,0 +1,5 @@
+---
+enhancements:
+ - |
+ `TikaDocumentConverter` now separates pages with form feed characters ("\f") in the extracted text.
+ Page breaks are only detected for PDF files.
@@ -12,7 +12,7 @@
 class TestTikaDocumentConverter:
  @patch("haystack.components.converters.tika.tika_parser.from_buffer")
  def test_run(self, mock_tika_parser):
- mock_tika_parser.return_value = {"content": "Content of mock source"}
+ mock_tika_parser.return_value = {"content": "<div><p>Content of mock source</p></div>"}
 
  component = TikaDocumentConverter()
  source = ByteStream(data=b"placeholder data")
@@ -61,13 +61,16 @@ def test_run_with_pdf_file(self, test_files_path):
  assert "A sample PDF file" in documents[0].content
  assert "Page 2 of Sample PDF" in documents[0].content
  assert "Page 4 of Sample PDF" in documents[0].content
+ assert documents[0].content.count("\f") == 3 # 4 pages
+
  assert "First Page" in documents[1].content
  assert (
  "Wiki engines usually allow content to be written using a simplified markup language"
  in documents[1].content
  )
  assert "This section needs additional citations for verification." in documents[1].content
  assert "This would make it easier for other users to find the article." in documents[1].content
+ assert documents[1].content.count("\f") == 3 # 4 pages
 
  @pytest.mark.integration
  def test_run_with_docx_file(self, test_files_path):