From 6173699b4435a8d8a62e2e114225029fffda0c56 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 8 Jul 2024 18:25:54 +0200 Subject: [PATCH 1/4] Validate xref position explicitly --- pdfminer/pdfdocument.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index bcacae6b..6aa2906a 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -950,19 +950,28 @@ def get_dest(self, name: Union[str, bytes]) -> Any: def find_xref(self, parser: PDFParser) -> int: """Internal function used to locate the first XRef.""" # search the last xref table by scanning the file backwards. - prev = None + prev = b"" for line in parser.revreadlines(): line = line.strip() log.debug("find_xref: %r", line) + if line == b"startxref": - break + log.debug("xref found: pos=%r", prev) + + if not prev.isdigit(): + raise PDFNoValidXRef(f"Invalid xref position: {prev}") + + start = int(prev) + + if not start >= 0: + raise PDFNoValidXRef(f"Invalid negative xref position: {start}") + + return start + if line: prev = line - else: - raise PDFNoValidXRef("Unexpected EOF") - log.debug("xref found: pos=%r", prev) - assert prev is not None - return int(prev) + + raise PDFNoValidXRef("Unexpected EOF") # read xref table def read_xref_from( From 8cc9c51ca18ad2246ec84d0d9ed179b1ff7cd364 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 8 Jul 2024 18:27:52 +0200 Subject: [PATCH 2/4] Update CHANGELOG.md --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 78eeea95..8a1f128a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,8 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Fixed -- `TypeError` when PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))]) -- `TypeError` when PDF literal cannot be converted to str ([#978](https://github.com/pdfminer/pdfminer.six/pull/978)) +- `TypeError` when corrupt PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972)) +- `TypeError` when corrupt PDF literal cannot be converted to str ([#978](https://github.com/pdfminer/pdfminer.six/pull/978)) +- `ValueError` when corrupt PDF specifies a negative xref location ([#980](https://github.com/pdfminer/pdfminer.six/pull/980)) ### Removed From 2cb3c91696037448653c56d3d6928b211e35a609 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 9 Jul 2024 06:45:55 +0200 Subject: [PATCH 3/4] Change constant `2` for more readable `io.SEEK_END` --- pdfminer/psparser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 0839f1f0..8bebaf55 100755 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import io # -*- coding: utf-8 -*- @@ -260,7 +261,7 @@ def revreadlines(self) -> Iterator[bytes]: This is used to locate the trailers at the end of a file. 
""" - self.fp.seek(0, 2) + self.fp.seek(0, io.SEEK_END) pos = self.fp.tell() buf = b"" while 0 < pos: From 47f20f2c63eb0668168aed769302b4caf5e50c04 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 9 Jul 2024 06:51:56 +0200 Subject: [PATCH 4/4] Fix formatting bytes in error message --- pdfminer/pdfdocument.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 6aa2906a..84713516 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -959,7 +959,7 @@ def find_xref(self, parser: PDFParser) -> int: log.debug("xref found: pos=%r", prev) if not prev.isdigit(): - raise PDFNoValidXRef(f"Invalid xref position: {prev}") + raise PDFNoValidXRef(f"Invalid xref position: {prev!r}") start = int(prev)