Propose PDFRouterParser and Loader

pprados · pprados · commit cbdaac01a91a · 2025-04-15T16:11:54.000+02:00
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six==20231228
+pdfminer-six==20250327
 pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import re
 import html
 import io
 import logging
@@ -1668,3 +1669,119 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
             docs = self._generate_docs(blob, result)
 
             yield from docs
+
+
+class PDFRouterParser(BaseBlobParser):
+    """Route each PDF to a parser chosen from its metadata or first page.
+
+    Each route is a tuple `(name, patterns, parser)`. `patterns` maps a
+    metadata key to a regex (string or compiled); the special key "page1"
+    is matched against the text of the first page. Patterns are applied
+    with `search`, and every pattern of a route must match. Routes are
+    tried in order and the first matching route wins, so finish with a
+    catch-all route `("default", {}, parser)`; when no route matches,
+    nothing is yielded.
+    This is similar to `MimeTypeBasedParser`, but keyed on the PDF content.
+
+    Sample:
+    ```python
+    from langchain_community.document_loaders.blob_loaders import Blob
+    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+    from langchain_community.document_loaders.parsers import PDFPlumberParser
+    routes = [
+        # Name, keys with regex, parser
+        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+        PyMuPDFParser()),
+        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
+        ("default", {}, PyPDFium2Parser())
+    ]
+    parser = PDFRouterParser(routes)
+    docs = list(parser.lazy_parse(Blob.from_path(filename)))
+    ```
+    """
+
+    def __init__(
+        self,
+        routes: list[
+            tuple[
+                str,
+                dict[str, Union[re.Pattern, str]],
+                BaseBlobParser,
+            ]
+        ],
+        *,
+        password: Optional[str] = None,
+    ):
+        """Initialize the router.
+
+        Args:
+            routes: Ordered `(name, patterns, parser)` tuples; string
+                patterns are compiled once here.
+            password: Optional password passed to `pypdf.PdfReader`.
+
+        Raises:
+            ImportError: If `pypdf` is not installed.
+        """
+        try:
+            import pypdf  # noqa:F401
+        except ImportError:
+            raise ImportError(
+                "pypdf package not found, please install it with `pip install pypdf`"
+            )
+        super().__init__()
+        self.password = password
+        # Compile string patterns once so routing only has to call `search`.
+        self.routes = [
+            (
+                name,
+                {
+                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
+                    for key, pattern in patterns.items()
+                },
+                parser,
+            )
+            for name, patterns, parser in routes
+        ]
+
+    def _select_route(
+        self,
+        blob: Blob,  # type: ignore[valid-type]
+    ) -> Optional[tuple[str, BaseBlobParser]]:
+        """Return `(name, parser)` of the first matching route, else None."""
+        from pypdf import PdfReader
+
+        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
+            # Plain construction: NOTE(review) older pypdf releases appear to
+            # lack `PdfReader.__enter__`; the enclosing `with` closes the stream.
+            reader = PdfReader(pdf_file_obj, password=self.password)
+            # pypdf documents `metadata` as optional, so fall back to {}.
+            metadata = _purge_metadata(cast(dict[str, Any], reader.metadata or {}))
+            # A PDF without pages has no first page; route on empty text then.
+            pages = reader.pages
+            metadata["page1"] = pages[0].extract_text() if len(pages) else ""
+            for name, patterns, parser in self.routes:
+                # A route matches only if every pattern finds its key; values
+                # that are missing or not strings cannot match.
+                if all(
+                    isinstance(metadata.get(key), str) and regex.search(metadata[key])
+                    for key, regex in patterns.items()
+                ):
+                    return name, parser
+        return None
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        """Lazily parse the blob with the first route that matches it.
+
+        Each yielded document gets the route name in `metadata["router"]`.
+        Nothing is yielded when no route matches.
+        """
+        # Routing closes its PDF stream before the selected parser runs.
+        route = self._select_route(blob)
+        if route is None:
+            return
+        name, parser = route
+        for doc in parser.lazy_parse(blob):
+            doc.metadata["router"] = name
+            yield doc
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
@@ -22,6 +22,8 @@
 from urllib.parse import urlparse
 
 import requests
+
+from langchain_core.document_loaders import BaseBlobParser
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_dict_or_env
 
@@ -37,7 +39,7 @@
     PDFPlumberParser,
     PyMuPDFParser,
     PyPDFium2Parser,
-    PyPDFParser,
+    PyPDFParser, PDFRouterParser,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
 
@@ -1423,3 +1425,70 @@ def lazy_load(self) -> Iterator[Document]:
 
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
+
+
+class PDFRouterLoader(BasePDFLoader):
+    """Load a PDF with a parser chosen from its metadata or first page.
+
+    Each route is a tuple `(name, patterns, parser)`. `patterns` maps a
+    metadata key to a regex; the special key "page1" searches the text of
+    the first page. Routes are tried in order and the first matching route
+    is used, so add a default route `("default", {}, parser)` at the end
+    to catch all PDFs. The routing itself is done by `PDFRouterParser`.
+
+    Sample:
+    ```python
+    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+    from langchain_community.document_loaders.parsers import PDFPlumberParser
+    routes = [
+        # Name, keys with regex, parser
+        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+        PyMuPDFParser()),
+        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
+        ("default", {}, PyPDFium2Parser())
+    ]
+    loader = PDFRouterLoader(filename, routes=routes)
+    loader.load()
+    ```
+    """
+
+    def __init__(
+        self,
+        file_path: Union[str, Path],
+        *,
+        routes: list[
+            tuple[
+                str,
+                dict[str, Union[re.Pattern, str]],
+                BaseBlobParser,
+            ]
+        ],
+        password: Optional[str] = None,
+    ):
+        """Initialize with a file path.
+
+        Args:
+            file_path: Location of the PDF, forwarded to `BasePDFLoader`.
+            routes: Ordered `(name, patterns, parser)` tuples forwarded to
+                `PDFRouterParser`; keyword-only.
+            password: Optional password forwarded to `PDFRouterParser`.
+        """
+        super().__init__(file_path)
+        self.parser = PDFRouterParser(routes, password=password)
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily load the file through the route-selected parser."""
+        if self.web_path:
+            # Read through a context manager so the handle is closed promptly.
+            with open(self.file_path, "rb") as pdf_file:
+                data = pdf_file.read()
+            blob = Blob.from_data(
+                data, path=self.web_path
+            )  # type: ignore[attr-defined]
+        else:
+            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
+        yield from self.parser.lazy_parse(blob)
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -11,8 +11,10 @@
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
     BaseImageBlobParser,
-    PDFPlumberParser,
+    PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
 )
+from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
+    PDFMinerParser
 
 if TYPE_CHECKING:
     from PIL.Image import Image
@@ -312,3 +314,29 @@ def _analyze_image(self, img: Image) -> str:
         **params,
     )
     _std_assert_with_parser(parser)
+
+
+def test_parser_router_parse() -> None:
+    """Run the shared parser assertions against a `PDFRouterParser`."""
+    parsing_mode = "single"
+    microsoft_route = (
+        "Microsoft",
+        {"producer": "Microsoft", "creator": "Microsoft"},
+        PyMuPDFParser(mode=parsing_mode),
+    )
+    libreoffice_route = (
+        "LibreOffice",
+        {"producer": "LibreOffice"},
+        PDFMinerParser(mode=parsing_mode),
+    )
+    xdvipdfmx_route = (
+        "Xdvipdfmx",
+        {"producer": "xdvipdfmx.*", "page1": "Hello"},
+        PDFMinerParser(mode=parsing_mode),
+    )
+    default_route = ("default", {}, PyPDFium2Parser(mode=parsing_mode))
+    # Route order is preserved; the catch-all route (no patterns) goes last.
+    router = PDFRouterParser(
+        routes=[microsoft_route, libreoffice_route, xdvipdfmx_route, default_route]
+    )
+    _assert_with_parser(router, splits_by_page=False)