Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit cbdaac0

Browse files
committed Apr 15, 2025 ·
Propose PDFRouterParser and Loader
1 parent ed5c480 commit cbdaac0

File tree

4 files changed

+178
-3
lines changed

4 files changed

+178
-3
lines changed
 

‎libs/community/extended_testing_deps.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
5959
oracle-ads>=2.9.1,<3
6060
oracledb>=2.2.0,<3
6161
pandas>=2.0.1,<3
62-
pdfminer-six==20231228
62+
pdfminer-six==20250327
6363
pdfplumber>=0.11
6464
pgvector>=0.1.6,<0.2
6565
playwright>=1.48.0,<2

‎libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import re
56
import html
67
import io
78
import logging
@@ -1668,3 +1669,91 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
16681669
docs = self._generate_docs(blob, result)
16691670

16701671
yield from docs
1672+
1673+
class PDFRouterParser(BaseBlobParser):
    """Parse a PDF with a parser chosen from its metadata or first page.

    The routes are a list of ``(name, patterns, parser)`` tuples:

    * ``name`` is stored in ``doc.metadata["router"]`` of every document
      produced through that route;
    * ``patterns`` maps a metadata key to a regular expression (a string or
      a compiled ``re.Pattern``). The special key ``"page1"`` is matched
      against the text extracted from the first page;
    * ``parser`` is the blob parser that handles the PDF when *all* the
      patterns of the route match (patterns are applied with ``search``).

    Routes are tried in order and the first matching route wins, so list the
    most specific routes first. Add a catch-all route
    ``("default", {}, parser)`` at the end: a route with no patterns always
    matches. If no route matches, no document is produced.

    This is similar to `MimeTypeBasedParser`, but routes on the content of
    the PDF file instead of its MIME type.

    Sample:
    ```python
    from langchain_community.document_loaders.pdf import PDFRouterLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        *,
        password: Optional[str] = None,
    ):
        """Initialize the router.

        Args:
            routes: Ordered list of ``(name, patterns, parser)`` tuples.
                String patterns are compiled once here.
            password: Optional password handed to ``pypdf.PdfReader`` when
                the PDF is opened for routing.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from exc
        super().__init__()
        self.password = password
        # Compile string patterns up front so matching never recompiles them.
        self.routes = [
            (
                name,
                {
                    key: re.compile(pattern) if isinstance(pattern, str) else pattern
                    for key, pattern in patterns.items()
                },
                parser,
            )
            for name, patterns, parser in routes
        ]

    def _select_route(
        self, metadata: dict[str, Any]
    ) -> Optional[tuple[str, BaseBlobParser]]:
        """Return ``(name, parser)`` of the first route matching *metadata*.

        A route matches when every one of its keys is present in *metadata*
        with a non-``None`` value that its pattern finds a match in. Returns
        ``None`` when no route matches.
        """
        for name, patterns, parser in self.routes:
            for key, pattern in patterns.items():
                value = metadata.get(key)
                if value is None:
                    break
                # Values are coerced to text so that a non-string metadata
                # value cannot make ``search`` raise a TypeError.
                if not pattern.search(value if isinstance(value, str) else str(value)):
                    break
            else:
                # No pattern failed (or the route has no patterns at all).
                return name, parser
        return None

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob with the first matching route's parser.

        Args:
            blob: The PDF to route and parse.

        Yields:
            The documents produced by the selected parser, each tagged with
            ``metadata["router"]`` set to the name of the route used.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from exc

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            # The stream is owned by the surrounding ``with``, so the reader
            # needs no close of its own. NOTE(review): this also avoids
            # relying on ``PdfReader`` being a context manager, which is
            # believed to be missing from older pypdf releases - confirm.
            reader = PdfReader(pdf_file_obj, password=self.password)
            # ``reader.metadata`` is None when the PDF carries no information
            # dictionary; fall back to an empty mapping in that case.
            metadata = _purge_metadata(cast(dict[str, Any], reader.metadata or {}))
            # A PDF without pages has no first-page text to match against.
            metadata["page1"] = (
                reader.pages[0].extract_text() if len(reader.pages) > 0 else ""
            )

        # Routing is done once the routing stream is released; the selected
        # parser opens the blob on its own.
        selected = self._select_route(metadata)
        if selected is None:
            return
        name, parser = selected
        for doc in parser.lazy_parse(blob):
            doc.metadata["router"] = name
            yield doc

‎libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
from urllib.parse import urlparse
2323

2424
import requests
25+
26+
from langchain_core.document_loaders import BaseBlobParser
2527
from langchain_core.documents import Document
2628
from langchain_core.utils import get_from_dict_or_env
2729

@@ -37,7 +39,7 @@
3739
PDFPlumberParser,
3840
PyMuPDFParser,
3941
PyPDFium2Parser,
40-
PyPDFParser,
42+
PyPDFParser, PDFRouterParser,
4143
)
4244
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
4345

@@ -1423,3 +1425,61 @@ def lazy_load(self) -> Iterator[Document]:
14231425

14241426
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
14251427
PagedPDFSplitter = PyPDFLoader
1428+
1429+
class PDFRouterLoader(BasePDFLoader):
    """Load a PDF with a parser chosen from its metadata or first page.

    The routes are a list of ``(name, patterns, parser)`` tuples, where
    ``patterns`` maps a metadata key to a regular expression and ``parser``
    is the parser to use when the route matches. The special key ``"page1"``
    searches the text of the first page. Routes are tried in order and the
    first matching route is used, so list the most specific routes first.
    Add a default route ``("default", {}, parser)`` at the end to catch all
    PDFs.

    Sample:
    ```python
    from langchain_community.document_loaders.pdf import PDFRouterLoader
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    routes = [
        # Name, keys with regex, parser
        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
         PyMuPDFParser()),
        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
        ("default", {}, PyPDFium2Parser())
    ]
    loader = PDFRouterLoader(filename, routes=routes)
    loader.load()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        routes: list[
            tuple[
                str,
                dict[str, Union[re.Pattern, str]],
                BaseBlobParser,
            ]
        ],
        password: Optional[str] = None,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path of the PDF, forwarded to ``BasePDFLoader``.
            routes: Ordered ``(name, patterns, parser)`` tuples, forwarded
                to ``PDFRouterParser``. This argument is keyword-only.
            password: Optional password forwarded to ``PDFRouterParser``.
        """
        super().__init__(file_path)
        self.parser = PDFRouterParser(routes, password=password)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load the PDF through the routing parser.

        Yields:
            The documents produced by ``PDFRouterParser.lazy_parse``.
        """
        if self.web_path:
            # presumably ``file_path`` is a local copy of ``web_path`` made by
            # the base loader - verify against BasePDFLoader. The bytes are
            # read inside a ``with`` so the file handle is closed promptly.
            with open(self.file_path, "rb") as pdf_file:
                data = pdf_file.read()
            blob = Blob.from_data(  # type: ignore[attr-defined]
                data, path=self.web_path
            )
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.lazy_parse(blob)

‎libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
from langchain_community.document_loaders.blob_loaders import Blob
1212
from langchain_community.document_loaders.parsers import (
1313
BaseImageBlobParser,
14-
PDFPlumberParser,
14+
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
1515
)
16+
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
17+
PDFMinerParser
1618

1719
if TYPE_CHECKING:
1820
from PIL.Image import Image
@@ -312,3 +314,27 @@ def _analyze_image(self, img: Image) -> str:
312314
**params,
313315
)
314316
_std_assert_with_parser(parser)
317+
318+
def test_parser_router_parse() -> None:
    """Run the shared parser assertions against a four-route PDFRouterParser."""
    mode = "single"

    # Routes are listed from most specific to the catch-all "default" route.
    microsoft_route = (
        "Microsoft",
        {"producer": "Microsoft", "creator": "Microsoft"},
        PyMuPDFParser(mode=mode),
    )
    libreoffice_route = (
        "LibreOffice",
        {"producer": "LibreOffice"},
        PDFMinerParser(mode=mode),
    )
    xdvipdfmx_route = (
        "Xdvipdfmx",
        {"producer": "xdvipdfmx.*", "page1": "Hello"},
        PDFMinerParser(mode=mode),
    )
    default_route = ("default", {}, PyPDFium2Parser(mode=mode))

    router = PDFRouterParser(
        routes=[microsoft_route, libreoffice_route, xdvipdfmx_route, default_route]
    )
    _assert_with_parser(router, splits_by_page=False)

0 commit comments

Comments
 (0)
Please sign in to comment.