Skip to content

Commit

Permalink
fix: Tika converter not yielding page break tags (\f) (#8082)
Browse files Browse the repository at this point in the history
* Fix TikaConverter not emitting the \f page-break tag: parse in HTML mode and then convert the HTML to text, using the old Haystack 1.X integration as a template.

* Add Reno

* Fix test by making Mock Tika return XML (before parsing)

* refinements and test

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
  • Loading branch information
lambda-science and anakin87 committed Jul 26, 2024
1 parent e0de423 commit 1c53aae
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 2 deletions.
40 changes: 39 additions & 1 deletion haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import io
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand All @@ -17,6 +18,36 @@
logger = logging.getLogger(__name__)


class XHTMLParser(HTMLParser):
    """
    Custom parser to extract pages from Tika XHTML content.

    Text found inside each ``<div class="page">`` element is collected into one
    entry of ``pages``. Content that is not wrapped in a page div is still
    captured: the parser starts in ingest mode and closes that first "page" at
    the first closing ``div`` or ``body`` tag.

    Attributes:
        ingest: Whether character data is currently being collected.
        page: Text accumulated for the page being read.
        pages: Completed pages, in document order.
    """

    def __init__(self):
        super().__init__()
        # Start ingesting right away so documents without page divs still yield text.
        self.ingest = True
        self.page = ""
        self.pages: List[str] = []
        # True only while inside an explicitly opened <div class="page">.
        self._in_page_div = False
        # Number of non-page <div> elements currently open inside the page div.
        self._nested_divs = 0

    def handle_starttag(self, tag: str, attrs: List[tuple]):
        """Identify the start of a page div and track divs nested inside it."""
        if tag != "div":
            return
        if ("class", "page") in attrs:
            self.ingest = True
            self._in_page_div = True
            self._nested_divs = 0
        elif self._in_page_div:
            # A div nested inside the page (e.g. an annotation block, if present):
            # its closing tag must not be mistaken for the end of the page.
            self._nested_divs += 1

    def handle_endtag(self, tag: str):
        """Identify the end of a page div and store the finished page."""
        if not self.ingest or tag not in ("div", "body"):
            return
        if tag == "div" and self._nested_divs > 0:
            # Closing a nested div: stay on the current page.
            self._nested_divs -= 1
            return
        self.ingest = False
        self._in_page_div = False
        self._nested_divs = 0
        # Restore words hyphenated across a line break.
        self.pages.append(self.page.replace("-\n", ""))
        self.page = ""

    def handle_data(self, data: str):
        """Populate the page content."""
        if self.ingest:
            self.page += data


@component
class TikaDocumentConverter:
"""
Expand Down Expand Up @@ -85,7 +116,14 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
text = tika_parser.from_buffer(io.BytesIO(bytestream.data), serverEndpoint=self.tika_url)["content"]
# we extract the content as XHTML to preserve the structure of the document as much as possible
# this works for PDFs, but does not work for other file types (DOCX)
xhtml_content = tika_parser.from_buffer(
io.BytesIO(bytestream.data), serverEndpoint=self.tika_url, xmlContent=True
)["content"]
xhtml_parser = XHTMLParser()
xhtml_parser.feed(xhtml_content)
text = "\f".join(xhtml_parser.pages)
except Exception as conversion_e:
logger.warning(
"Failed to extract text from {source}. Skipping it. Error: {error}",
Expand Down
5 changes: 5 additions & 0 deletions releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
enhancements:
- |
`TikaDocumentConverter` now returns page breaks ("\f") in the output.
This only works for PDF files.
5 changes: 4 additions & 1 deletion test/components/converters/test_tika_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
class TestTikaDocumentConverter:
@patch("haystack.components.converters.tika.tika_parser.from_buffer")
def test_run(self, mock_tika_parser):
mock_tika_parser.return_value = {"content": "Content of mock source"}
mock_tika_parser.return_value = {"content": "<div><p>Content of mock source</p></div>"}

component = TikaDocumentConverter()
source = ByteStream(data=b"placeholder data")
Expand Down Expand Up @@ -61,13 +61,16 @@ def test_run_with_pdf_file(self, test_files_path):
assert "A sample PDF file" in documents[0].content
assert "Page 2 of Sample PDF" in documents[0].content
assert "Page 4 of Sample PDF" in documents[0].content
assert documents[0].content.count("\f") == 3 # 4 pages

assert "First Page" in documents[1].content
assert (
"Wiki engines usually allow content to be written using a simplified markup language"
in documents[1].content
)
assert "This section needs additional citations for verification." in documents[1].content
assert "This would make it easier for other users to find the article." in documents[1].content
assert documents[1].content.count("\f") == 3 # 4 pages

@pytest.mark.integration
def test_run_with_docx_file(self, test_files_path):
Expand Down

0 comments on commit 1c53aae

Please sign in to comment.