From 3629ade9db036bb94fa1d4551dcb5c1ebb2a241f Mon Sep 17 00:00:00 2001 From: J08nY Date: Wed, 9 Nov 2022 13:20:24 +0100 Subject: [PATCH] Fix OCR page order issue. Fixes #279. --- sec_certs/utils/pdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sec_certs/utils/pdf.py b/sec_certs/utils/pdf.py index a40fb18d..8f0e6abf 100644 --- a/sec_certs/utils/pdf.py +++ b/sec_certs/utils/pdf.py @@ -58,7 +58,8 @@ def ocr_pdf_file(pdf_path: Path) -> str: if tes.returncode != 0: raise ValueError(f"tesseract failed: {tes.returncode}") contents = "" - for txt_path in map(Path, glob.glob(str(tmppath / "image*.txt"))): + txt_paths = list(glob.glob(str(tmppath / "image*.txt"))) + for txt_path in map(Path, sorted(txt_paths, key=lambda fname: int(fname[6:-4]))): with txt_path.open("r", encoding="utf-8") as f: contents += f.read() return contents