-
Notifications
You must be signed in to change notification settings - Fork 6
/
tesseract_ocr.pyx
104 lines (86 loc) · 3.32 KB
/
tesseract_ocr.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
from cpython cimport *
from interface cimport *
from cpython.version cimport PY_MAJOR_VERSION
# Candidate directories scanned by Tesseract.get_available_languages() for
# "*.traineddata" files. That scan is only consulted when no explicit
# tessdata prefix was given to the Tesseract constructor.
TESSDATA_POSSIBLE_PATHS = [
"/usr/local/share/tessdata",
"/usr/share/tessdata",
"/usr/share/tesseract/tessdata",
"/usr/local/share/tesseract-ocr/tessdata",
"/usr/share/tesseract-ocr/tessdata",
"/app/vendor/tesseract-ocr/tessdata", # Heroku
"/opt/local/share/tessdata", # OSX MacPorts
]
# File-name suffix of a language data file; the part of the file name before
# this suffix is treated as the language code (e.g. "eng.traineddata" -> "eng").
TESSDATA_EXTENSION = ".traineddata"
cdef unicode u(s):
    # Coerce ``s`` to a text (unicode) object.
    #
    # * byte strings are decoded as UTF-8 (on every Python version);
    # * unicode objects are copied rather than cast, so the result is always
    #   an exact ``unicode`` instance that is safe for further processing;
    # * anything else raises TypeError carrying the offending type.
    if isinstance(s, bytes):
        return (<bytes>s).decode('utf-8')
    if not isinstance(s, unicode):
        raise TypeError(type(s))
    return unicode(s)
cdef bytes b(s):
    # Coerce ``s`` to a byte string.
    #
    # Text (``unicode``; this is ``str`` on Python 3) is encoded as UTF-8.
    # Any other value is returned unchanged, so byte strings and None pass
    # straight through; the ``bytes`` return type makes Cython raise
    # TypeError for everything else.
    #
    # The previous implementation guarded its first branch with
    # ``PY_MAJOR_VERSION > 3``, which is never true, and used an exact
    # ``type(s) is unicode`` test that rejected unicode subclasses. A single
    # isinstance() check covers both Python 2 and Python 3 text objects.
    if isinstance(s, unicode):
        return s.encode('utf-8')
    return s
cdef class Tesseract:
    """Thin wrapper around a ``TessBaseAPI`` instance for one language.

    Parameters
    ----------
    tessdata_prefix : str or bytes, optional
        Directory that contains a ``tessdata`` sub-directory. When omitted,
        NULL is passed to ``TessBaseAPI.Init`` and the language is validated
        against ``TESSDATA_POSSIBLE_PATHS`` instead.
    lang : str or bytes
        Language code; ``<lang>.traineddata`` must exist. Defaults to 'eng'.
    """
    cdef TessBaseAPI _api
    # Raw pointer handed to TessBaseAPI.Init(); NULL when no prefix was given.
    cdef char *tessdata_prefix
    # Owns the buffer ``tessdata_prefix`` points into. Without this reference
    # the bytes object would be released at the end of __cinit__ and the
    # pointer would dangle.
    cdef bytes _tessdata_prefix_bytes
    cdef lang

    def __cinit__(self, tessdata_prefix=None, lang='eng'):
        cdef bytes prefix_bytes
        self._api = TessBaseAPI()
        self.tessdata_prefix = NULL
        self._tessdata_prefix_bytes = None
        if tessdata_prefix:
            prefix_bytes = b(tessdata_prefix)
            # Store the owner first, then borrow its buffer.
            self._tessdata_prefix_bytes = prefix_bytes
            self.tessdata_prefix = prefix_bytes
        self.lang = lang

    cdef unicode _text_for_pix(self, Pix *image):
        # Run OCR on ``image`` and return the recognised text, stripped of
        # leading/trailing whitespace.
        #
        # Takes ownership of ``image``: it is destroyed before returning,
        # whether recognition succeeds or an exception is raised.
        #
        # Raises AssertionError if the language data is not available and
        # RuntimeError if Tesseract cannot be initialised or yields no text
        # buffer.
        cdef char *outText
        try:
            if self.tessdata_prefix:
                # Encode both parts separately so ``lang`` may be text or
                # bytes on any Python version.
                lang_tessdata = os.path.join(
                    self._tessdata_prefix_bytes,
                    b('tessdata'),
                    b(self.lang) + b(TESSDATA_EXTENSION),
                )
                assert os.path.isfile(lang_tessdata), \
                    "Language %s is not available." % self.lang
            else:
                assert u(self.lang) in self.get_available_languages(), \
                    "Language %s is not available." % self.lang
            # Init() reports failure with a non-zero (negative) status, so
            # testing for "> 0" would never detect it.
            if self._api.Init(self.tessdata_prefix, b(self.lang)) != 0:
                raise RuntimeError("Could not initialize tesseract.")
            self._api.SetImage(image)
            outText = self._api.GetUTF8Text()
            if outText == NULL:
                raise RuntimeError("Tesseract returned no text.")
            # NOTE(review): Tesseract documents that the caller must release
            # the GetUTF8Text() buffer with delete[]; no deallocator is
            # visible among this file's imports, so it is not freed here --
            # confirm whether interface.pxd exposes one.
            return u(<bytes>outText).strip()
        finally:
            pixDestroy(&image)

    cpdef get_available_languages(self):
        """Return the language codes found in ``TESSDATA_POSSIBLE_PATHS``.

        Every readable directory in the list is scanned for files ending in
        ``TESSDATA_EXTENSION`` (case-insensitive); the file name minus that
        suffix is reported. Languages present in several directories appear
        once per directory.
        """
        cdef Py_ssize_t suffix_len = len(TESSDATA_EXTENSION)
        languages = []
        for dirpath in TESSDATA_POSSIBLE_PATHS:
            # Skip missing paths, non-directories and unreadable directories
            # instead of letting os.listdir() raise.
            if not os.path.isdir(dirpath) or not os.access(dirpath, os.R_OK):
                continue
            for filename in os.listdir(dirpath):
                if filename.lower().endswith(TESSDATA_EXTENSION):
                    languages.append(filename[:-suffix_len])
        return languages

    cpdef unicode text_for_bytes(self, buf):
        """Recognise text in an encoded image held in the byte string ``buf``.

        Raises RuntimeError if the image data cannot be decoded.
        """
        cdef Pix *image
        image = pixReadMem(<bytes>buf, len(buf))
        if image == NULL:
            raise RuntimeError('pixRead failed')
        return self._text_for_pix(image)

    cpdef unicode text_for_filename(self, filename):
        """Recognise text in the image file at ``filename``.

        Raises IOError if the file does not exist and RuntimeError if it
        cannot be decoded as an image.
        """
        cdef Pix *image
        if not os.path.isfile(filename):
            raise IOError("No such file or directory")
        image = pixRead(b(filename))
        # Same guard as text_for_bytes(): a NULL Pix must never reach
        # Tesseract.
        if image == NULL:
            raise RuntimeError('pixRead failed')
        return self._text_for_pix(image)
cpdef unicode text_for_filename(filename, lang='eng'):
    """Recognise text in the image file ``filename`` using language ``lang``.

    Convenience wrapper: builds a ``Tesseract`` without an explicit tessdata
    prefix and delegates to its ``text_for_filename`` method.
    """
    return Tesseract(None, lang).text_for_filename(filename)
cpdef unicode text_for_bytes(buf, lang='eng'):
    """Recognise text in the encoded image bytes ``buf`` using language ``lang``.

    Convenience wrapper: builds a ``Tesseract`` without an explicit tessdata
    prefix and delegates to its ``text_for_bytes`` method.
    """
    return Tesseract(None, lang).text_for_bytes(buf)