Skip to content
This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit

Permalink
Determine the psm parameter based on the Tesseract version.
Browse files Browse the repository at this point in the history
It turns out that Tesseract versions before the current 4 beta accept only
'-psm', whereas the latest build accepts only '--psm'.
  • Loading branch information
ddddavidmartin committed Jun 11, 2018
1 parent c136838 commit b95bdbc
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
7 changes: 4 additions & 3 deletions src/pyocr/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import xml.dom.minidom
import logging

from .tesseract import psm_parameter
from .util import to_unicode

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -305,7 +306,7 @@ class TextBuilder(BaseBuilder):
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
file_ext = ["txt"]
tess_flags = ["--psm", str(tesseract_layout)]
tess_flags = [psm_parameter(), str(tesseract_layout)]
cun_args = ["-f", "text"]
# Add custom cuneiform parameters if needed
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
Expand Down Expand Up @@ -562,7 +563,7 @@ class WordBoxBuilder(BaseBuilder):

def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
tess_flags = ["--psm", str(tesseract_layout)]
tess_flags = [psm_parameter(), str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
Expand Down Expand Up @@ -638,7 +639,7 @@ class LineBoxBuilder(BaseBuilder):

def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
tess_flags = ["--psm", str(tesseract_layout)]
tess_flags = [psm_parameter(), str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
Expand Down
11 changes: 10 additions & 1 deletion src/pyocr/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,15 @@ def can_detect_orientation():
)


def psm_parameter():
    """
    Pick the page segmentation mode flag for the installed Tesseract.

    Returns:
        "-psm" when the first component of get_version() is 3 or lower,
        "--psm" otherwise.
    """
    major = get_version()[0]
    return "-psm" if major <= 3 else "--psm"


def detect_orientation(image, lang=None):
"""
Arguments:
Expand All @@ -178,7 +187,7 @@ def detect_orientation(image, lang=None):
"""
_set_environment()
with temp_dir() as tmpdir:
command = [TESSERACT_CMD, "input.bmp", 'stdout', "--psm", "0"]
command = [TESSERACT_CMD, "input.bmp", 'stdout', psm_parameter(), "0"]
version = get_version()
if version[0] >= 4:
# XXX: temporary fix to remove once Tesseract 4 is stable
Expand Down

0 comments on commit b95bdbc

Please sign in to comment.