-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr_pdf.py
81 lines (69 loc) · 3.02 KB
/
ocr_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import argparse
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import pytesseract
from concurrent.futures import ProcessPoolExecutor
from PIL import Image, ImageOps, ImageEnhance
# Default settings. The command-line handling at the bottom of the file
# overwrites these module-level values when the matching options are given.

# When True, only pages 1-30 of the PDF are rasterized and OCR'd.
DEBUG = False
# Resolution used both to rasterize the PDF pages and as Tesseract's --dpi hint.
DPI = 150 # Try to match source document resolution
# Contrast factor applied before OCR. Below 1.0 the page is also brightened
# (brightness = 2 - CONTRAST) so faint noise is washed out.
CONTRAST = 0.9 # lower to remove noise, higher than one to increase contrast
# Tesseract language string passed as `lang=` to pytesseract.
LANG = "eng+fra"
# Upper bound on worker processes handed to ProcessPoolExecutor.
MAX_WORKERS = 16
def ocr_page(args):
    """Preprocess one rasterized PDF page and run Tesseract OCR on it.

    The function takes a single tuple so it can be mapped directly over
    ``enumerate(images)`` by an executor.

    Args:
        args: ``(index, image)`` pair, where ``index`` is the zero-based
            position of the page and ``image`` is the Pillow image of it.

    Returns:
        A ``(page_number, ocr_text)`` tuple; ``page_number`` is one-based.
    """
    index, image = args
    page_number = index + 1
    # Log the same one-based number that is returned and later written to the
    # output file, so progress messages and results can be matched up.
    print(f"Processing page {page_number} ...")

    # convert Pillow image to grayscale
    grayscale_image = ImageOps.grayscale(image)

    # Blow out see-through text: when contrast is being reduced, raise the
    # brightness by the same amount so faint marks fade towards white.
    brightness = 1.0
    if CONTRAST < brightness:
        brightness = 2 - CONTRAST  # 1.15 to 1.7 seems to work best
    brightened_image = ImageEnhance.Brightness(grayscale_image).enhance(brightness)

    # decrease contrast
    faded_image = ImageEnhance.Contrast(brightened_image).enhance(CONTRAST)

    # Median filter pass intended to smooth out noise.
    # NOTE(review): OpenCV documents ksize as odd and greater than 1; with a
    # kernel size of 1 this presumably leaves the image unchanged - confirm
    # whether 3 was intended before changing it, as it will affect OCR output.
    image_np = np.array(faded_image)
    blurred_np = cv2.medianBlur(image_np, 1)

    processed_image = Image.fromarray(blurred_np)
    # Record the resolution on the image and also pass it to Tesseract explicitly.
    processed_image.info['dpi'] = (DPI, DPI)
    ocr_text = pytesseract.image_to_string(processed_image, lang=LANG, config=f"--dpi {DPI}")
    return (page_number, ocr_text)  # Return page number and OCR text as a tuple
def _init_ocr_worker(dpi, contrast, lang):
    """Copy the parent's OCR settings into a worker process's module globals."""
    global DPI, CONTRAST, LANG
    DPI = dpi
    CONTRAST = contrast
    LANG = lang


def ocr_pdf(input_path):
    """OCR every page of a PDF in parallel and save the text next to it.

    The PDF is rasterized at ``DPI`` (only pages 1-30 when ``DEBUG`` is set),
    each page is processed by ``ocr_page`` in a process pool, and the results
    are written in page order to a ``.txt`` file that shares the PDF's path
    and base name.

    Args:
        input_path: Path to the PDF file to process.
    """
    print("Extracting images from pdf...")
    if DEBUG:  # Extract only the first 30 images
        images = convert_from_path(input_path, dpi=DPI, first_page=1, last_page=30)
    else:  # Extract all images
        images = convert_from_path(input_path, dpi=DPI)
    print(f"{len(images)} pages found. Starting OCR.")

    # Never ask for more processes than there are pages (but at least one,
    # since the executor rejects max_workers=0).
    worker_count = max(1, min(MAX_WORKERS, len(images)))

    # Worker processes started with the "spawn" method re-import this module
    # and would otherwise see the default DPI/CONTRAST/LANG rather than the
    # values set at runtime, so forward the current settings explicitly.
    with ProcessPoolExecutor(
        max_workers=worker_count,
        initializer=_init_ocr_worker,
        initargs=(DPI, CONTRAST, LANG),
    ) as executor:
        # executor.map preserves input order, so results stay in page order.
        ocr_texts = list(executor.map(ocr_page, enumerate(images)))

    output_txt_path = os.path.splitext(input_path)[0] + ".txt"
    # Write UTF-8 explicitly: OCR output may contain characters that the
    # platform's default encoding cannot represent.
    with open(output_txt_path, 'w', encoding="utf-8") as f:
        # Write page number and OCR text to the file
        f.writelines(f"Page {page_num}:\n{text}\n\n" for page_num, text in ocr_texts)
    print(f"OCR completed and saved to {output_txt_path}")
if __name__ == "__main__":
    # Command-line entry point: parse options, override the module-level
    # settings, then OCR the given PDF.
    parser = argparse.ArgumentParser(
        description="Run OCR on every page of a PDF and save the text to a .txt file next to it.")
    parser.add_argument("input_path", type=str, help="Path to the input PDF file")
    # Defaults come from the module constants so they are defined in one place.
    parser.add_argument("--dpi", type=int, default=DPI,
                        help="DPI setting for pdf OCR (optional)")
    parser.add_argument("--contrast", type=float, default=CONTRAST,
                        help="less than 1.0 to remove faint noise (optional)")
    parser.add_argument("--lang", type=str, default=LANG,
                        help="Tesseract language string for OCR (optional)")
    parser.add_argument("--debug", action='store_true',
                        help="Enable debug mode: only the first 30 pages are processed (optional)")
    args = parser.parse_args()

    if args.dpi <= 0:
        # A non-positive resolution cannot be used to rasterize pages.
        parser.error("--dpi must be a positive integer")

    DEBUG = args.debug
    DPI = args.dpi
    CONTRAST = args.contrast
    LANG = args.lang

    if not os.path.isfile(args.input_path):
        # Exit with a non-zero status (message goes to stderr) so callers and
        # shell scripts can detect the failure.
        raise SystemExit(f"Error: File '{args.input_path}' not found.")

    ocr_pdf(input_path=args.input_path)