Skip to content
This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Update pyocr to use psm_parameter based on tesseract version #102

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
9 changes: 6 additions & 3 deletions src/pyocr/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,9 @@ class TextBuilder(BaseBuilder):

def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
from .tesseract import psm_parameter
tess_flags = [psm_parameter(), str(tesseract_layout)]
file_ext = ["txt"]
tess_flags = ["-psm", str(tesseract_layout)]
cun_args = ["-f", "text"]
# Add custom cuneiform parameters if needed
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
Expand Down Expand Up @@ -561,8 +562,9 @@ class WordBoxBuilder(BaseBuilder):
"""

def __init__(self, tesseract_layout=1):
from .tesseract import psm_parameter
tess_flags = [psm_parameter(), str(tesseract_layout)]
file_ext = ["html", "hocr"]
tess_flags = ["-psm", str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
Expand Down Expand Up @@ -637,8 +639,9 @@ class LineBoxBuilder(BaseBuilder):
"""

def __init__(self, tesseract_layout=1):
from .tesseract import psm_parameter
tess_flags = [psm_parameter(), str(tesseract_layout)]
file_ext = ["html", "hocr"]
tess_flags = ["-psm", str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
Expand Down
8 changes: 7 additions & 1 deletion src/pyocr/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ def can_detect_orientation():
)


def psm_parameter():
    """
    Select the spelling of the psm command-line option for Tesseract.

    Returns:
        "--psm" when the first component of get_version() is greater
        than 3, "-psm" otherwise.
    """
    major = get_version()[0]
    if major > 3:
        return "--psm"
    return "-psm"


def detect_orientation(image, lang=None):
"""
Arguments:
Expand All @@ -178,7 +184,7 @@ def detect_orientation(image, lang=None):
"""
_set_environment()
with temp_dir() as tmpdir:
command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
command = [TESSERACT_CMD, "input.bmp", 'stdout', psm_parameter(), "0"]
version = get_version()
if version[0] >= 4:
# XXX: temporary fix to remove once Tesseract 4 is stable
Expand Down