From 3629ade9db036bb94fa1d4551dcb5c1ebb2a241f Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Wed, 9 Nov 2022 13:20:24 +0100
Subject: [PATCH] Fix OCR page order issue.

Fixes #279.
---
 sec_certs/utils/pdf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sec_certs/utils/pdf.py b/sec_certs/utils/pdf.py
index a40fb18d..8f0e6abf 100644
--- a/sec_certs/utils/pdf.py
+++ b/sec_certs/utils/pdf.py
@@ -58,7 +58,8 @@ def ocr_pdf_file(pdf_path: Path) -> str:
             if tes.returncode != 0:
                 raise ValueError(f"tesseract failed: {tes.returncode}")
         contents = ""
-        for txt_path in map(Path, glob.glob(str(tmppath / "image*.txt"))):
+        txt_paths = list(glob.glob(str(tmppath / "image*.txt")))
+        for txt_path in map(Path, sorted(txt_paths, key=lambda fname: int(fname[6:-4]))):
             with txt_path.open("r", encoding="utf-8") as f:
                 contents += f.read()
     return contents