langchain-ai · pprados · Apr 15, 2025 · Apr 15, 2025 · Apr 15, 2025 · Apr 16, 2025
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six==20231228
+pdfminer-six==20250327
 pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2

diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -29,6 +29,7 @@
     from langchain_community.document_loaders.parsers.pdf import (
         PDFMinerParser,
         PDFPlumberParser,
+        PDFRouterParser,
         PyMuPDFParser,
         PyPDFium2Parser,
         PyPDFParser,
@@ -51,6 +52,7 @@
     "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
     "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
+    "PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
     "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
     "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@@ -76,6 +78,7 @@ def __getattr__(name: str) -> Any:
     "OpenAIWhisperParser",
     "PDFMinerParser",
     "PDFPlumberParser",
+    "PDFRouterParser",
     "PyMuPDFParser",
     "PyPDFParser",
     "PyPDFium2Parser",

diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -5,6 +5,7 @@
 import html
 import io
 import logging
+import re
 import threading
 import warnings
 from datetime import datetime
@@ -1668,3 +1669,95 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
             docs = self._generate_docs(blob, result)
 
             yield from docs
+
+
+class PDFRouterParser(BaseBlobParser):
+    """Route a PDF to one of several parsers based on its metadata or first page.
+
+    Each route is a ``(name, patterns, parser)`` tuple. ``patterns`` maps a
+    metadata key to a regex (``str`` or compiled); the special key ``"page1"``
+    is searched in the text of the first page. A route matches when all its
+    patterns are found, and the first matching route wins, so order matters.
+    Add a default route ``("default", {}, parser)`` at the end to catch all
+    PDFs; if no route matches, nothing is yielded. Each yielded document gets
+    ``metadata["router"]`` set to the name of the selected route.
+    This code is similar to `MimeTypeBasedParser`, but on the content of the PDF.
+
+    Sample:
+    ```python
+    from langchain_community.document_loaders.blob_loaders import Blob
+    from langchain_community.document_loaders.parsers import PDFPlumberParser
+    from langchain_community.document_loaders.parsers import PDFRouterParser
+    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+    routes = [
+        # Name, keys with regex, parser
+        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+        PyMuPDFParser()),
+        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
+        ("default", {}, PyPDFium2Parser())
+    ]
+    parser = PDFRouterParser(routes)
+    docs = list(parser.lazy_parse(Blob.from_path(filename)))
+    ```
+    """
+
+    Routes = Sequence[
+        tuple[
+            str,
+            Mapping[str, Union[re.Pattern, str]],
+            BaseBlobParser,
+        ]
+    ]
+
+    def __init__(
+        self,
+        routes: Routes,
+        *,
+        password: Optional[str] = None,
+    ):
+        """Store the routes, compiling their string patterns once.
+
+        Args:
+            routes: Ordered ``(name, patterns, parser)`` tuples.
+            password: Optional password passed to ``pypdf.PdfReader``.
+
+        Raises:
+            ImportError: If ``pypdf`` is not installed.
+        """
+        try:
+            import pypdf  # noqa:F401
+        except ImportError:
+            raise ImportError(
+                "pypdf package not found, please install it with `pip install pypdf`"
+            )
+        super().__init__()
+        self.password = password
+        # re.compile() returns an already compiled pattern unchanged.
+        self.routes = [
+            (name, {k: re.compile(v) for k, v in patterns.items()}, parser)
+            for name, patterns, parser in routes
+        ]
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        """Lazily parse the blob with the first route whose patterns all match."""
+        from pypdf import PdfReader
+
+        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
+            with PdfReader(pdf_file_obj, password=self.password) as reader:
+                # ``metadata`` may be None and an empty PDF has no first page.
+                metadata = _purge_metadata(cast(dict[str, Any], reader.metadata or {}))
+                has_page = len(reader.pages) > 0
+                metadata["page1"] = reader.pages[0].extract_text() if has_page else ""
+        # The probing reader is closed before delegating to the selected parser.
+        for name, patterns, parser in self.routes:
+            # str(): metadata values are not guaranteed to be strings.
+            if all(
+                k in metadata and p.search(str(metadata[k]))
+                for k, p in patterns.items()
+            ):
+                for doc in parser.lazy_parse(blob):
+                    doc.metadata["router"] = name
+                    yield doc
+                return
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
@@ -1423,3 +1423,62 @@ def lazy_load(self) -> Iterator[Document]:
 
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
+
+
+# class PDFRouterLoader(BasePDFLoader):
+#     """
+#     Load PDFs using different parsers based on the metadata of the PDF
+#     or the body of the first page.
+#     The routes are defined as a list of tuples, where each tuple contains
+#     the name, a dictionary of metadata and regex pattern and the parser to use.
+#     The special key "page1" is to search in the first page with a regexp.
+#     Use the route in the correct order, as the first matching route is used.
+#     Add a default route ("default", {}, parser) at the end to catch all PDFs.
+#
+#     Sample:
+#     ```python
+#     from langchain_community.document_loaders import PyPDFLoader
+#     from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+#     from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+#     from langchain_community.document_loaders.parsers import PDFPlumberParser
+#     routes = [
+#         # Name, keys with regex, parser
+#         ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+#         PyMuPDFParser()),
+#         ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+#         ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
+#         PDFPlumberParser()),
+#         ("default", {}, PyPDFium2Parser())
+#     ]
+#     loader = PDFRouterLoader(filename, routes=routes)
+#     loader.load()
+#     ```
+#     """
+#
+#     def __init__(
+#         self,
+#         file_path: Union[str, Path],
+#         *,
+#         routes: list[
+#             tuple[
+#                 str,
+#                 dict[str, Union[re.Pattern, str]],
+#                 BaseBlobParser,
+#             ]
+#         ],
+#         password: Optional[str] = None,
+#     ):
+#         """Initialize with a file path."""
+#         super().__init__(file_path)
+#         self.parser = PDFRouterParser(routes, password=password)
+#
+#     def lazy_load(
+#         self,
+#     ) -> Iterator[Document]:
+#         if self.web_path:
+#             blob = Blob.from_data(open(self.file_path, "rb").read(),
+#             path=self.web_path)  # type: ignore[attr-defined]
+#         else:
+#             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
+#         yield from self.parser.lazy_parse(blob)
+# FIXME: PDFRouterLoader above is disabled (commented out); enable it or remove it.
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -2,7 +2,7 @@
 
 import re
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterator
+from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
 
 import pytest
 
@@ -11,7 +11,11 @@
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
     BaseImageBlobParser,
+    PDFMinerParser,
     PDFPlumberParser,
+    PDFRouterParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
 )
 
 if TYPE_CHECKING:
@@ -312,3 +316,37 @@ def _analyze_image(self, img: Image) -> str:
         **params,
     )
     _std_assert_with_parser(parser)
+
+
+def test_parser_router_parse() -> None:
+    """Run the shared parser assertions through a PDFRouterParser.
+
+    Four routes are declared; every delegate parser uses ``mode="single"``
+    and the check is run with ``splits_by_page=False``.
+    """
+    single: Literal["single"] = "single"
+    # Each route: (name, {metadata key or "page1": regex}, delegate parser).
+    table: PDFRouterParser.Routes = [
+        (
+            "Xdvipdfmx",
+            {"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
+            PDFMinerParser(mode=single),
+        ),
+        (
+            "Microsoft",
+            {"producer": "Microsoft", "creator": "Microsoft"},
+            PyMuPDFParser(mode=single),
+        ),
+        (
+            "LibreOffice",
+            {"producer": "LibreOffice"},
+            PDFMinerParser(mode=single),
+        ),
+        (
+            "default",
+            cast(dict[str, Union[re.Pattern, str]], {}),
+            PyPDFium2Parser(mode=single),
+        ),
+    ]
+    router = PDFRouterParser(routes=table)
+    _assert_with_parser(router, splits_by_page=False)
diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
         "OpenAIWhisperParser",
         "PyPDFParser",
         "PDFMinerParser",
+        "PDFRouterParser",
         "PyMuPDFParser",
         "PyPDFium2Parser",
         "PDFPlumberParser",