Skip to content

Commit

Permalink
Runner for olmocr bench
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Feb 19, 2025
1 parent a8f6921 commit 4e0339f
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 1 deletion.
83 changes: 83 additions & 0 deletions olmocr/bench/runners/run_gotocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os
import argparse
import tempfile
import base64
import torch

from olmocr.data.renderpdf import render_pdf_to_base64png

from transformers import AutoModel, AutoTokenizer

# Load GOT-OCR model and tokenizer.
# Both loads are pinned to the same repository commit: trust_remote_code=True
# executes Python code shipped in the model repo, so an unpinned load would run
# whatever is on the default branch at download time and could drift from the
# pinned model weights.
GOT_OCR_REPO = 'ucaslcl/GOT-OCR2_0'
GOT_OCR_REVISION = "979938bf89ccdc949c0131ddd3841e24578a4742"

tokenizer = AutoTokenizer.from_pretrained(
    GOT_OCR_REPO,
    trust_remote_code=True,
    revision=GOT_OCR_REVISION,
)
# Use the GPU when torch can see one; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    GOT_OCR_REPO,
    trust_remote_code=True,
    use_safetensors=True,
    revision=GOT_OCR_REVISION,
    # Reuse the tokenizer's EOS token id as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
)
# Inference only: switch to eval mode and move the model to the selected device.
model = model.eval().to(device)


def run(pdf_folder):
    """
    Convert the first page of every PDF in a folder to markdown using GOT-OCR.

    For each file directly inside ``pdf_folder`` whose name ends in ``.pdf``
    (matched case-insensitively), page 1 is rendered to a PNG via
    ``render_pdf_to_base64png`` (``target_longest_image_dim=1024``), passed to
    ``model.chat`` with ``ocr_type='ocr'``, and the returned text is written to
    ``<pdf name without extension>.md``.

    The markdown files are saved in a folder called "marker" located alongside
    the pdf_folder; the folder is created if it does not exist.

    NOTE(review): the "marker" folder name looks inherited from the marker
    runner; it is kept unchanged here so existing consumers of the output
    location keep working -- confirm whether a GOT-OCR-specific folder is wanted.

    :param pdf_folder: Path to the folder containing PDF files.
    """
    # Resolve absolute paths and prepare destination folder
    pdf_folder = os.path.abspath(pdf_folder)
    destination_folder = os.path.join(os.path.dirname(pdf_folder), "marker")
    os.makedirs(destination_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so runs are reproducible.
    pdf_files = sorted(
        os.path.join(pdf_folder, filename)
        for filename in os.listdir(pdf_folder)
        if filename.lower().endswith(".pdf")
    )

    for pdf_path in pdf_files:
        print(f"Processing {pdf_path} ...")

        base64image = render_pdf_to_base64png(pdf_path, page_num=1, target_longest_image_dim=1024)
        png_bytes = base64.b64decode(base64image)

        # model.chat is handed a file path, so the rendered PNG is written to a
        # temporary file. delete=False lets us close (and thereby flush) the
        # file before it is reopened by name; we remove it ourselves below.
        tmp = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False)
        try:
            with tmp:
                tmp.write(png_bytes)

            # Run GOT-OCR on the saved image
            # The OCR result is assumed to be a plain text string.
            res = model.chat(tokenizer, tmp.name, ocr_type='ocr')
        finally:
            # Always clean up the temporary image, even if writing or OCR fails.
            os.remove(tmp.name)

        # Swap only the final extension for .md. splitext is used instead of
        # str.replace so that upper-case ".PDF" names (accepted by the filter
        # above) are handled and ".pdf" occurring mid-name is left untouched.
        stem, _ = os.path.splitext(os.path.basename(pdf_path))
        output_path = os.path.join(destination_folder, f"{stem}.md")

        with open(output_path, "w", encoding="utf-8") as fout:
            fout.write(res)

        print(f"Saved markdown to {output_path}")


if __name__ == "__main__":
    # Command-line entry point: takes a single positional folder path and
    # hands it straight to run().
    cli = argparse.ArgumentParser(
        description="Convert all PDF files in a folder to markdown using GOT-OCR and save them to a sibling 'marker' folder."
    )
    cli.add_argument(
        "pdf_folder",
        type=str,
        help="Path to the folder containing PDF files (e.g., '/path/to/pdfs')",
    )
    run(cli.parse_args().pdf_folder)
1 change: 0 additions & 1 deletion olmocr/bench/runners/run_marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def run(pdf_folder):
artifact_dict=create_model_dict(),
)


for pdf_path in pdf_files:
rendered = converter(pdf_path)
# Create the markdown filename by replacing the .pdf extension with .md
Expand Down

0 comments on commit 4e0339f

Please sign in to comment.