From b95bdbcca270889527159b3b2a5a4e20a7738a57 Mon Sep 17 00:00:00 2001 From: David Martin Date: Mon, 11 Jun 2018 16:30:49 +1000 Subject: [PATCH] Determine the psm parameter based on the Tesseract version. It turns out that for versions before the current 4 beta only '-psm' is allowed, and the latest build only allows '--psm'. --- src/pyocr/builders.py | 7 ++++--- src/pyocr/tesseract.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py index 5959a7c..f27b829 100644 --- a/src/pyocr/builders.py +++ b/src/pyocr/builders.py @@ -14,6 +14,7 @@ import xml.dom.minidom import logging +from .tesseract import psm_parameter from .util import to_unicode logger = logging.getLogger(__name__) @@ -305,7 +306,7 @@ class TextBuilder(BaseBuilder): def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False, cuneiform_fax=False, cuneiform_singlecolumn=False): file_ext = ["txt"] - tess_flags = ["--psm", str(tesseract_layout)] + tess_flags = [psm_parameter(), str(tesseract_layout)] cun_args = ["-f", "text"] # Add custom cuneiform parameters if needed for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"), @@ -562,7 +563,7 @@ class WordBoxBuilder(BaseBuilder): def __init__(self, tesseract_layout=1): file_ext = ["html", "hocr"] - tess_flags = ["--psm", str(tesseract_layout)] + tess_flags = [psm_parameter(), str(tesseract_layout)] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, @@ -638,7 +639,7 @@ class LineBoxBuilder(BaseBuilder): def __init__(self, tesseract_layout=1): file_ext = ["html", "hocr"] - tess_flags = ["--psm", str(tesseract_layout)] + tess_flags = [psm_parameter(), str(tesseract_layout)] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index 5b8e002..c1329e0 100755 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -161,6 +161,15 @@ def can_detect_orientation(): ) +def psm_parameter(): + """Return the psm option string depending on the Tesseract version.""" + version = get_version() + if version[0] <= 3: + return "-psm" + + return "--psm" + + def detect_orientation(image, lang=None): """ Arguments: @@ -178,7 +187,7 @@ def detect_orientation(image, lang=None): """ _set_environment() with temp_dir() as tmpdir: - command = [TESSERACT_CMD, "input.bmp", 'stdout', "--psm", "0"] + command = [TESSERACT_CMD, "input.bmp", 'stdout', psm_parameter(), "0"] version = get_version() if version[0] >= 4: # XXX: temporary fix to remove once Tesseract 4 is stable