Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Tika converter not yielding page break tags (\f) #8082

Merged
merged 5 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import io
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand All @@ -17,6 +18,36 @@
logger = logging.getLogger(__name__)


class XHTMLParser(HTMLParser):
    """
    Custom parser to extract pages from Tika XHTML content.

    A page starts at ``<div class="page">`` and ends at the matching ``</div>`` (or at
    ``</body>``). Each finished page is appended to ``pages``. Collection is enabled from
    the start, so text seen before any page div is captured as well; this is what lets
    content without page divs still produce an entry.
    """

    def __init__(self):
        super().__init__()
        # True while character data should be collected into the current page
        self.ingest = True
        # text collected so far for the page that is currently open
        self.page = ""
        # finished pages, in document order
        self.pages: List[str] = []
        # number of currently open <div> elements inside the region being collected;
        # needed so that a nested </div> does not end the page prematurely
        self._div_depth = 0

    def handle_starttag(self, tag: str, attrs: List[tuple]) -> None:
        """
        Identify the start of a page div and keep track of nested divs.

        :param tag: Lower-cased tag name, as provided by `HTMLParser`.
        :param attrs: List of `(name, value)` attribute pairs of the tag.
        """
        if tag != "div":
            return
        if self.ingest:
            # a div opened while collecting (the page div itself when collection was already
            # enabled, or a div nested in the page): remember it so that its closing tag
            # is matched correctly
            self._div_depth += 1
        elif any(attr == "class" and value == "page" for attr, value in attrs):
            self.ingest = True
            self._div_depth = 1

    def handle_endtag(self, tag: str) -> None:
        """
        Identify the end of a page div and store the collected page.

        :param tag: Lower-cased tag name, as provided by `HTMLParser`.
        """
        if not self.ingest:
            return
        if tag == "div":
            self._div_depth -= 1
            if self._div_depth > 0:
                # closing tag of a nested div: the page is still open
                return
        elif tag != "body":
            return

        # the outermost div (or the body) was closed: the page is complete
        self.ingest = False
        self._div_depth = 0
        # restore words hyphenated across a line break
        self.pages.append(self.page.replace("-\n", ""))
        self.page = ""

    def handle_data(self, data: str) -> None:
        """
        Populate the page content.

        :param data: Text found between tags.
        """
        if self.ingest:
            self.page += data


@component
class TikaDocumentConverter:
"""
Expand Down Expand Up @@ -85,7 +116,14 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
text = tika_parser.from_buffer(io.BytesIO(bytestream.data), serverEndpoint=self.tika_url)["content"]
# we extract the content as XHTML to preserve the structure of the document as much as possible
# this works for PDFs, but does not work for other file types (DOCX)
xhtml_content = tika_parser.from_buffer(
io.BytesIO(bytestream.data), serverEndpoint=self.tika_url, xmlContent=True
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
)["content"]
xhtml_parser = XHTMLParser()
xhtml_parser.feed(xhtml_content)
text = "\f".join(xhtml_parser.pages)
except Exception as conversion_e:
logger.warning(
"Failed to extract text from {source}. Skipping it. Error: {error}",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
enhancements:
- |
`TikaDocumentConverter` now returns page breaks ("\f") in the output.
Page breaks are only emitted for PDF files; other file types (e.g. DOCX) are converted without them.
5 changes: 4 additions & 1 deletion test/components/converters/test_tika_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
class TestTikaDocumentConverter:
@patch("haystack.components.converters.tika.tika_parser.from_buffer")
def test_run(self, mock_tika_parser):
mock_tika_parser.return_value = {"content": "Content of mock source"}
mock_tika_parser.return_value = {"content": "<div><p>Content of mock source</p></div>"}

component = TikaDocumentConverter()
source = ByteStream(data=b"placeholder data")
Expand Down Expand Up @@ -61,13 +61,16 @@ def test_run_with_pdf_file(self, test_files_path):
assert "A sample PDF file" in documents[0].content
assert "Page 2 of Sample PDF" in documents[0].content
assert "Page 4 of Sample PDF" in documents[0].content
assert documents[0].content.count("\f") == 3 # 4 pages

assert "First Page" in documents[1].content
assert (
"Wiki engines usually allow content to be written using a simplified markup language"
in documents[1].content
)
assert "This section needs additional citations for verification." in documents[1].content
assert "This would make it easier for other users to find the article." in documents[1].content
assert documents[1].content.count("\f") == 3 # 4 pages

@pytest.mark.integration
def test_run_with_docx_file(self, test_files_path):
Expand Down