Skip to content

[community] Propose PDFRouterParser and Loader #30847

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3
pandas>=2.0.1,<3
pdfminer-six==20231228
pdfminer-six==20250327
pdfplumber>=0.11
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PDFPlumberParser,
PDFRouterParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
@@ -51,6 +52,7 @@
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@@ -76,6 +78,7 @@ def __getattr__(name: str) -> Any:
"OpenAIWhisperParser",
"PDFMinerParser",
"PDFPlumberParser",
"PDFRouterParser",
"PyMuPDFParser",
"PyPDFParser",
"PyPDFium2Parser",
93 changes: 93 additions & 0 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
import html
import io
import logging
import re
import threading
import warnings
from datetime import datetime
@@ -1668,3 +1669,95 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
docs = self._generate_docs(blob, result)

yield from docs


class PDFRouterParser(BaseBlobParser):
    """Route a PDF to one of several parsers based on its content.

    The router opens the PDF with ``pypdf``, reads its metadata and the text
    of its first page, then hands the blob to the parser of the first route
    whose criteria all match.

    Each route is a tuple ``(name, criteria, parser)``:

    * ``name`` is stored in ``doc.metadata["router"]`` of every document
      produced through that route.
    * ``criteria`` maps a metadata key to a regular expression (a string or
      a compiled pattern). The special key ``"page1"`` is searched in the
      text of the first page. Patterns are applied with ``search``, so they
      may match anywhere in the value.
    * ``parser`` is the blob parser used when every criterion matches.

    Routes are evaluated in order and the first matching route wins. A route
    with empty criteria always matches, so add ``("default", {}, parser)`` at
    the end to catch all PDFs. If no route matches, no document is produced
    and a warning is logged.

    This is similar to `MimeTypeBasedParser`, but it routes on the content of
    the PDF file instead of its MIME type.

    Sample:
        ```python
        from langchain_community.document_loaders.blob_loaders import Blob
        from langchain_community.document_loaders.parsers import (
            PDFPlumberParser,
            PDFRouterParser,
            PyMuPDFParser,
            PyPDFium2Parser,
        )

        routes = [
            # Name, keys with regex, parser
            (
                "Microsoft",
                {"producer": "Microsoft", "creator": "Microsoft"},
                PyMuPDFParser(),
            ),
            ("LibreOffice", {"producer": "LibreOffice"}, PDFPlumberParser()),
            (
                "Xdvipdfmx",
                {"producer": "xdvipdfmx.*", "page1": "Hello"},
                PDFPlumberParser(),
            ),
            ("default", {}, PyPDFium2Parser()),
        ]
        parser = PDFRouterParser(routes)
        docs = list(parser.lazy_parse(Blob.from_path(filename)))
        ```
    """

    # Public alias so callers can annotate their own route lists.
    Routes = Sequence[
        tuple[
            str,
            Mapping[str, Union[re.Pattern, str]],
            BaseBlobParser,
        ]
    ]

    def __init__(
        self,
        routes: Routes,
        *,
        password: Optional[str] = None,
    ):
        """Initialize the router.

        Args:
            routes: Ordered ``(name, criteria, parser)`` tuples. String
                criteria are compiled to regular expressions once, here.
            password: Optional password forwarded to ``pypdf.PdfReader``
                when the router inspects the PDF.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from err
        super().__init__()
        self.password = password
        # Normalize every criterion to a compiled pattern so that matching
        # in `lazy_parse` never has to distinguish str from re.Pattern.
        self.routes = [
            (
                name,
                {
                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
                    for key, pattern in criteria.items()
                },
                parser,
            )
            for name, criteria, parser in routes
        ]

    @staticmethod
    def _matches(criteria: Mapping[str, Any], metadata: Mapping[str, Any]) -> bool:
        """Return True when every pattern matches its metadata value.

        A missing (or ``None``) metadata value never matches. Values are
        converted with ``str`` before searching so that a non-string value
        cannot raise ``TypeError``.
        """
        for key, pattern in criteria.items():
            value = metadata.get(key)
            if value is None or not pattern.search(str(value)):
                return False
        return True

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob with the parser of the first matching route.

        Args:
            blob: The PDF blob to inspect and parse.

        Yields:
            The documents produced by the selected parser, each with
            ``metadata["router"]`` set to the name of the matching route.
            Nothing is yielded when no route matches.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from err

        # Inspect the PDF first, and release the reader before delegating:
        # the selected parser opens the blob again on its own.
        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            with PdfReader(pdf_file_obj, password=self.password) as reader:
                # `reader.metadata` may be None when the PDF carries no
                # document information; treat that as empty metadata.
                metadata = _purge_metadata(
                    cast(dict[str, Any], reader.metadata or {})
                )
                pages = reader.pages
                # A PDF without pages has no first-page text to search.
                first_page_text = pages[0].extract_text() if len(pages) > 0 else ""
        metadata["page1"] = first_page_text or ""

        # First route whose criteria all match; empty criteria always match.
        selected = next(
            (
                (name, parser)
                for name, criteria, parser in self.routes
                if self._matches(criteria, metadata)
            ),
            None,
        )
        if selected is None:
            logging.getLogger(__name__).warning(
                "PDFRouterParser: no route matched the PDF; no document produced. "
                'Add a ("default", {}, parser) route to catch all PDFs.'
            )
            return

        route_name, route_parser = selected
        for doc in route_parser.lazy_parse(blob):
            doc.metadata["router"] = route_name
            yield doc
59 changes: 59 additions & 0 deletions libs/community/langchain_community/document_loaders/pdf.py
Original file line number Diff line number Diff line change
@@ -1423,3 +1423,62 @@ def lazy_load(self) -> Iterator[Document]:

# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader


# class PDFRouterLoader(BasePDFLoader):
# """
# Load PDFs using different parsers based on the metadata of the PDF
# or the body of the first page.
# The routes are defined as a list of tuples, where each tuple contains
# the name, a dictionary of metadata and regex pattern and the parser to use.
# The special key "page1" is to search in the first page with a regexp.
# Use the route in the correct order, as the first matching route is used.
# Add a default route ("default", {}, parser) at the end to catch all PDFs.
#
# Sample:
# ```python
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
# from langchain_community.document_loaders.parsers import PDFPlumberParser
# routes = [
# # Name, keys with regex, parser
# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
# PyMuPDFParser()),
# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
# PDFPlumberParser()),
# ("default", {}, PyPDFium2Parser())
# ]
# loader = PDFRouterLoader(filename, routes=routes)
# loader.load()
# ```
# """
#
# def __init__(
# self,
# file_path: Union[str, Path],
# *,
# routes: list[
# tuple[
# str,
# dict[str, Union[re.Pattern, str]],
# BaseBlobParser,
# ]
# ],
# password: Optional[str] = None,
# ):
# """Initialize with a file path."""
# super().__init__(file_path)
# self.parser = PDFRouterParser(routes, password=password)
#
# def lazy_load(
# self,
# ) -> Iterator[Document]:
# if self.web_path:
# blob = Blob.from_data(open(self.file_path, "rb").read(),
# path=self.web_path) # type: ignore[attr-defined]
# else:
# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
# yield from self.parser.lazy_parse(blob)
# FIXME
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@

import re
from pathlib import Path
from typing import TYPE_CHECKING, Iterator
from typing import TYPE_CHECKING, Iterator, Literal, Union, cast

import pytest

@@ -11,7 +11,11 @@
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
BaseImageBlobParser,
PDFMinerParser,
PDFPlumberParser,
PDFRouterParser,
PyMuPDFParser,
PyPDFium2Parser,
)

if TYPE_CHECKING:
@@ -312,3 +316,37 @@ def _analyze_image(self, img: Image) -> str:
**params,
)
_std_assert_with_parser(parser)


def test_parser_router_parse() -> None:
    """Run the shared parser assertions against a four-route PDFRouterParser.

    The routes cover a compiled-pattern criterion, string criteria, a
    first-page ("page1") criterion and a catch-all route with no criteria.
    Every delegate parser is built in "single" mode, so the shared check is
    called with ``splits_by_page=False``.
    """
    mode: Literal["single"] = "single"
    routes: PDFRouterParser.Routes = [
        # Compiled pattern on the producer plus a first-page text criterion.
        (
            "Xdvipdfmx",
            {"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
            PDFMinerParser(mode=mode),
        ),
        # Two plain-string criteria that must both match.
        (
            "Microsoft",
            {"producer": "Microsoft", "creator": "Microsoft"},
            PyMuPDFParser(mode=mode),
        ),
        # Single plain-string criterion.
        ("LibreOffice", {"producer": "LibreOffice"}, PDFMinerParser(mode=mode)),
        # Catch-all: empty criteria match every PDF.
        (
            "default",
            cast(dict[str, Union[re.Pattern, str]], {}),
            PyPDFium2Parser(mode=mode),
        ),
    ]
    router = PDFRouterParser(routes=routes)
    _assert_with_parser(router, splits_by_page=False)
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
"OpenAIWhisperParser",
"PyPDFParser",
"PDFMinerParser",
"PDFRouterParser",
"PyMuPDFParser",
"PyPDFium2Parser",
"PDFPlumberParser",