diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py
index 2bc75dd2cc..926968a176 100644
--- a/haystack/components/converters/tika.py
+++ b/haystack/components/converters/tika.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+from html.parser import HTMLParser
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -17,6 +18,36 @@
 logger = logging.getLogger(__name__)
 
 
+class XHTMLParser(HTMLParser):
+    """
+    Split Tika XHTML into pages: buffered text is flushed to `pages` when a `div` or `body` closes while ingesting.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.ingest = True  # on from the start, so text seen before any page div is buffered as well
+        self.page = ""  # text buffer of the page currently being read
+        self.pages: List[str] = []  # finished pages, in the order they were closed
+
+    def handle_starttag(self, tag: str, attrs: List[tuple]):
+        """Turn ingestion on when a div start tag carries a class attribute equal to 'page'."""
+        if tag == "div" and any(attr == "class" and value == "page" for attr, value in attrs):
+            self.ingest = True
+
+    def handle_endtag(self, tag: str):
+        """On any closing div or body while ingesting: stop ingesting, store the buffered page, reset the buffer."""
+        if self.ingest and tag in ("div", "body"):
+            self.ingest = False  # NOTE(review): a nested div's end tag also lands here and ends the page early
+            # re-join words hyphenated across a line break by dropping every "-\n" from the page text
+            self.pages.append(self.page.replace("-\n", ""))
+            self.page = ""
+
+    def handle_data(self, data: str):
+        """Append text data to the current page buffer, but only while ingestion is on."""
+        if self.ingest:
+            self.page += data
+
+
 @component
 class TikaDocumentConverter:
     """
@@ -85,7 +116,14 @@ def run(
                 logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                 continue
             try:
-                text = tika_parser.from_buffer(io.BytesIO(bytestream.data), serverEndpoint=self.tika_url)["content"]
+                # Extract the content as XHTML to preserve as much of the document structure as possible.
+                # Page breaks are only recovered for PDFs, not for other file types (e.g. DOCX).
+                xhtml_content = tika_parser.from_buffer(
+                    io.BytesIO(bytestream.data), serverEndpoint=self.tika_url, xmlContent=True
+                )["content"]
+                xhtml_parser = XHTMLParser()
+                xhtml_parser.feed(xhtml_content)
+                text = "\f".join(xhtml_parser.pages)
             except Exception as conversion_e:
                 logger.warning(
                     "Failed to extract text from {source}. Skipping it. Error: {error}",
diff --git a/releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml b/releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml
new file mode 100644
index 0000000000..e66f422958
--- /dev/null
+++ b/releasenotes/notes/fix-tika-page_number-2d600b2dc8a4faa7.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    `TikaDocumentConverter` now inserts a page break (form feed, "\f") between pages in its output.
+    Page breaks are only detected for PDF files.
diff --git a/test/components/converters/test_tika_doc_converter.py b/test/components/converters/test_tika_doc_converter.py
index 9d1ef7d538..42ba3ebe1d 100644
--- a/test/components/converters/test_tika_doc_converter.py
+++ b/test/components/converters/test_tika_doc_converter.py
@@ -12,7 +12,7 @@
 class TestTikaDocumentConverter:
     @patch("haystack.components.converters.tika.tika_parser.from_buffer")
     def test_run(self, mock_tika_parser):
-        mock_tika_parser.return_value = {"content": "Content of mock source"}
+        mock_tika_parser.return_value = {"content": "<div><p>Content of mock source</p></div>"}
 
         component = TikaDocumentConverter()
         source = ByteStream(data=b"placeholder data")
@@ -61,6 +61,8 @@ def test_run_with_pdf_file(self, test_files_path):
         assert "A sample PDF file" in documents[0].content
         assert "Page 2 of Sample PDF" in documents[0].content
         assert "Page 4 of Sample PDF" in documents[0].content
+        assert documents[0].content.count("\f") == 3  # 4 pages
+
         assert "First Page" in documents[1].content
         assert (
             "Wiki engines usually allow content to be written using a simplified markup language"
@@ -68,6 +70,7 @@ def test_run_with_pdf_file(self, test_files_path):
         )
         assert "This section needs additional citations for verification." in documents[1].content
         assert "This would make it easier for other users to find the article." in documents[1].content
+        assert documents[1].content.count("\f") == 3  # 4 pages
 
     @pytest.mark.integration
     def test_run_with_docx_file(self, test_files_path):