class XHTMLParser(HTMLParser):
    """
    Custom parser to extract pages from Tika XHTML content.

    Text is collected into `page` while `ingest` is true. A page is finished, and
    appended to `pages`, when the div that started it is closed or when `</body>`
    is reached. Divs nested inside the current page are tracked so that their
    closing tags do not end the page prematurely.

    Ingestion is enabled from the start, so a document that contains no
    `<div class="page">` at all ends up as a single page flushed at `</body>`.
    """

    def __init__(self):
        super().__init__()
        # True while character data should be appended to the current page.
        self.ingest = True
        # Text accumulated for the page currently being read.
        self.page = ""
        # Completed pages, in document order.
        self.pages: List[str] = []
        # Number of non-page divs opened, and not yet closed, inside the current page.
        self._open_inner_divs = 0

    def handle_starttag(self, tag: str, attrs: List[tuple]):
        """
        Identify the start of a page div and keep track of divs nested inside a page.

        :param tag: Lower-cased tag name.
        :param attrs: List of `(name, value)` attribute pairs of the tag.
        """
        if tag != "div":
            return
        if any(attr == "class" and value == "page" for attr, value in attrs):
            self.ingest = True
            self._open_inner_divs = 0
        elif self.ingest:
            # A nested div: remember it so its end tag is not mistaken for the end of the page.
            self._open_inner_divs += 1

    def handle_endtag(self, tag: str):
        """
        Identify the end of a page div (or of the body) and store the finished page.

        :param tag: Lower-cased tag name.
        """
        if not self.ingest or tag not in ("div", "body"):
            return
        if tag == "div" and self._open_inner_divs > 0:
            # This closes a nested div, the page itself is still open.
            self._open_inner_divs -= 1
            return
        self.ingest = False
        self._open_inner_divs = 0
        # restore words hyphened to the next line
        self.pages.append(self.page.replace("-\n", ""))
        self.page = ""

    def handle_data(self, data: str):
        """
        Populate the page content.

        :param data: Text found between tags; ignored when no page is being ingested.
        """
        if self.ingest:
            self.page += data
Error: {error}", source=source, error=e) continue try: - text = tika_parser.from_buffer(io.BytesIO(bytestream.data), serverEndpoint=self.tika_url)["content"] + # we extract the content as XHTML to preserve the structure of the document as much as possible + # this works for PDFs, but does not work for other file types (DOCX) + xhtml_content = tika_parser.from_buffer( + io.BytesIO(bytestream.data), serverEndpoint=self.tika_url, xmlContent=True + )["content"] + xhtml_parser = XHTMLParser() + xhtml_parser.feed(xhtml_content) + text = "\f".join(xhtml_parser.pages) except Exception as conversion_e: logger.warning( "Failed to extract text from {source}. Skipping it. Error: {error}", diff --git a/releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml b/releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml new file mode 100644 index 0000000000..e66f422958 --- /dev/null +++ b/releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + `TikaDocumentConverter` now returns page breaks ("\f") in the output. + This only works for PDF files. diff --git a/test/components/converters/test_tika_doc_converter.py b/test/components/converters/test_tika_doc_converter.py index 9d1ef7d538..42ba3ebe1d 100644 --- a/test/components/converters/test_tika_doc_converter.py +++ b/test/components/converters/test_tika_doc_converter.py @@ -12,7 +12,7 @@ class TestTikaDocumentConverter: @patch("haystack.components.converters.tika.tika_parser.from_buffer") def test_run(self, mock_tika_parser): - mock_tika_parser.return_value = {"content": "Content of mock source"} + mock_tika_parser.return_value = {"content": "

Content of mock source

"} component = TikaDocumentConverter() source = ByteStream(data=b"placeholder data") @@ -61,6 +61,8 @@ def test_run_with_pdf_file(self, test_files_path): assert "A sample PDF file" in documents[0].content assert "Page 2 of Sample PDF" in documents[0].content assert "Page 4 of Sample PDF" in documents[0].content + assert documents[0].content.count("\f") == 3 # 4 pages + assert "First Page" in documents[1].content assert ( "Wiki engines usually allow content to be written using a simplified markup language" @@ -68,6 +70,7 @@ def test_run_with_pdf_file(self, test_files_path): ) assert "This section needs additional citations for verification." in documents[1].content assert "This would make it easier for other users to find the article." in documents[1].content + assert documents[1].content.count("\f") == 3 # 4 pages @pytest.mark.integration def test_run_with_docx_file(self, test_files_path):