Runner for olmocr bench

allenai · Feb 19, 2025 · 4e0339f · 4e0339f
1 parent a8f6921
commit 4e0339f
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 1 deletion.
diff --git a/olmocr/bench/runners/run_gotocr.py b/olmocr/bench/runners/run_gotocr.py
@@ -0,0 +1,83 @@
+import os
+import argparse
+import tempfile
+import base64
+import torch
+
+from olmocr.data.renderpdf import render_pdf_to_base64png
+
+from transformers import AutoModel, AutoTokenizer
+
+# Load GOT-OCR model and tokenizer; trust_remote_code=True lets repo code run, so pin both to one commit.
+GOT_OCR_REPO = 'ucaslcl/GOT-OCR2_0'
+GOT_OCR_REVISION = "979938bf89ccdc949c0131ddd3841e24578a4742"
+tokenizer = AutoTokenizer.from_pretrained(GOT_OCR_REPO, trust_remote_code=True, revision=GOT_OCR_REVISION)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModel.from_pretrained(
+    GOT_OCR_REPO,
+    trust_remote_code=True,
+    use_safetensors=True,
+    revision=GOT_OCR_REVISION,
+    pad_token_id=tokenizer.eos_token_id
+)
+model = model.eval().to(device)
+
+
+def run(pdf_folder):
+    """
+    Convert the first page of every PDF in the specified folder to markdown using GOT-OCR.
+    Page 1 of each PDF is rendered to a PNG image and processed with OCR.
+    The markdown files are saved in a folder called "marker" located alongside the pdf_folder.
+
+    :param pdf_folder: Path to the folder containing PDF files.
+    """
+    # Resolve absolute paths and prepare destination folder
+    # NOTE(review): the folder is named "marker" although this runner is GOT-OCR — confirm intended.
+    pdf_folder = os.path.abspath(pdf_folder)
+    parent_dir = os.path.dirname(pdf_folder)
+    destination_folder = os.path.join(parent_dir, "marker")
+    os.makedirs(destination_folder, exist_ok=True)
+
+    # List all PDF files in the folder (extension matched case-insensitively)
+    pdf_files = [
+        os.path.join(pdf_folder, filename)
+        for filename in os.listdir(pdf_folder)
+        if filename.lower().endswith(".pdf")
+    ]
+
+    for pdf_path in pdf_files:
+        print(f"Processing {pdf_path} ...")
+
+        base64image = render_pdf_to_base64png(pdf_path, page_num=1, target_longest_image_dim=1024)
+
+        # model.chat is given a file path, so stage the PNG in a temporary file.
+        tmp = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False)
+        try:
+            with tmp:
+                tmp.write(base64.b64decode(base64image))
+            # The OCR result is assumed to be a plain text string.
+            res = model.chat(tokenizer, tmp.name, ocr_type='ocr')
+        finally:
+            # Always remove the temporary image, even when decoding or OCR raises.
+            os.remove(tmp.name)
+
+        # splitext swaps the extension whatever its case; replace('.pdf', ...) missed ".PDF".
+        file_name = os.path.splitext(os.path.basename(pdf_path))[0] + ".md"
+        output_path = os.path.join(destination_folder, file_name)
+        with open(output_path, "w", encoding="utf-8") as fout:
+            fout.write(res)
+
+        print(f"Saved markdown to {output_path}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: the PDF folder is the single positional argument.
+    cli = argparse.ArgumentParser(
+        description="Convert all PDF files in a folder to markdown using GOT-OCR and save them to a sibling 'marker' folder."
+    )
+    cli.add_argument(
+        "pdf_folder",
+        type=str,
+        help="Path to the folder containing PDF files (e.g., '/path/to/pdfs')",
+    )
+    run(cli.parse_args().pdf_folder)
diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py
@@ -34,7 +34,6 @@ def run(pdf_folder):
         artifact_dict=create_model_dict(),
     )
 
-
     for pdf_path in pdf_files:
         rendered = converter(pdf_path)
         # Create the markdown filename by replacing the .pdf extension with .md