-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathautotrain.py
executable file
·228 lines (162 loc) · 6.83 KB
/
autotrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Autotrain is a script designed to help generate Tesseract training
# data from image/boxfile pairs.
#
# The order of execution is:
# * Work out the language and a list of fonts present
# * Generate .tr files from each boxfile
# * Concatenate all .tr and .box files for each font into
# single files
# * Run unicharset_extractor on the boxfiles
# * Run mftraining and cntraining
# * Rename the output files to include the language prefix
# * Run combine_tessdata on all the generated files
# * Copy the lang.traineddata file to the tessdata directory.
# (set this directory via tessdataDirectory in AutoTrainer.__init__)
#
# You must run this program only for one language at a time.
#
# Because many Tesseract tools output files in the current working
# directory, you must run this script from the directory holding your
# images/boxfiles.
#
# You must run the script with high enough permissions to allow the file
# to be copied to the tesseract directory.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
import subprocess
import os
import sys
import shutil
import codecs
class AutoTrainer(object):
    """Generate Tesseract training data from the image/box pairs found in
    the current working directory.

    Images are expected to be named ``lang.font.expN.tif`` (or ``.png``)
    with a matching ``lang.font.expN.box`` next to each one.  Call run()
    to execute the whole pipeline; the individual steps are also exposed
    as methods and communicate through instance attributes (baselist,
    lang, fontList, catBoxFileList, catTrFileList).

    All print calls use the single-argument parenthesised form so the
    script behaves the same under Python 2 and Python 3.
    """

    # Image types that are picked up from the working directory.
    IMAGE_EXTENSIONS = ('.tif', '.png')

    def __init__(self):
        # Directory the finished <lang>.traineddata file is copied into.
        self.tessdataDirectory = '/usr/local/share/tessdata'
        # Image base names (without extension), filled in by run().
        self.baselist = []
        # Base name -> extension of the image it was discovered with.
        self.extensions = {}

    def _call(self, cmd):
        """Run *cmd* (an argument list) and return its exit status.

        A non-zero status is reported but does not abort the run, so the
        remaining steps are still attempted as before.
        """
        status = subprocess.call(cmd)
        if status != 0:
            print('Warning: %s exited with status %s' % (cmd[0], status))
        return status

    def _find_images(self):
        """Populate baselist/extensions from the images in the cwd."""
        self.baselist = []
        self.extensions = {}
        for filename in sorted(os.listdir(os.getcwd())):
            name, extension = os.path.splitext(filename)
            if extension not in self.IMAGE_EXTENSIONS:
                continue
            if name in self.extensions:
                # Same base name already seen with another extension.
                continue
            if len(name.split('.')) < 3:
                # The later steps index the language and font out of the
                # name, so an unrelated image would only crash them.
                print('Skipping %s: name is not of the form lang.font.expN'
                      % filename)
                continue
            # self.ext is kept for callers that fill baselist by hand.
            self.ext = extension
            self.extensions[name] = extension
            self.baselist.append(name)

    def _tesseract_command(self, filename):
        """Return the tesseract argument list that trains on *filename*."""
        # Use the extension this particular image was found with; a single
        # shared extension breaks directories mixing .tif and .png.
        extension = self.extensions.get(filename)
        if extension is None:
            extension = self.ext
        return ['tesseract', '%s%s' % (filename, extension), filename,
                'nobatch', 'box.train.stderr']

    def generate_tr_files(self):
        """Run tesseract in box-training mode on every image in baselist."""
        for filename in self.baselist:
            # generate the .tr files from the .tif/png + .box
            self._call(self._tesseract_command(filename))

    def generate_unicharset(self):
        """Run unicharset_extractor on the concatenated box files."""
        print('\nGenerating unicharset')
        cmd = ['unicharset_extractor']
        cmd.extend(self.catBoxFileList)
        print('Running command: %s' % (cmd,))
        self._call(cmd)

    def do_mftraining(self):
        """Run mftraining on the concatenated .tr files."""
        print('\nBeginning mftraining')
        cmd = ['mftraining', '-U', 'unicharset',
               '-O', self.lang + '.unicharset']
        cmd.extend(self.catTrFileList)
        print('Running command: %s' % (cmd,))
        self._call(cmd)

    def do_cntraining(self):
        """Run cntraining on the concatenated .tr files."""
        print('\nBeginning cntraining')
        cmd = ['cntraining']
        cmd.extend(self.catTrFileList)
        print('Running command: %s' % (cmd,))
        self._call(cmd)

    def rename_files(self):
        """Prefix the training output files with the language code."""
        print('\nRenaming files')
        for filename in ['normproto', 'Microfeat', 'inttemp', 'pffmtable']:
            newFilename = '%s.%s' % (self.lang, filename)
            shutil.move(filename, newFilename)

    def combine_data(self):
        """Run combine_tessdata on the ``<lang>.`` prefixed files."""
        print('\nCombining data')
        cmd = ['combine_tessdata', self.lang + '.']
        print('Running cmd: %s' % (cmd,))
        self._call(cmd)

    def get_language(self):
        """Set self.lang from the first component of the first base name.

        Raises IndexError when baselist is empty.
        """
        self.lang = self.baselist[0].split('.')[0]
        print('\nFound language: %s' % self.lang)

    def get_font_list(self):
        """Set self.fontList to the distinct fonts, in order of appearance.

        The font is the second dot-separated component of each base name.
        """
        self.fontList = []
        for filename in self.baselist:
            font = filename.split('.')[1]
            if font not in self.fontList:
                self.fontList.append(font)
        print('\nFound fonts: %s' % (self.fontList,))

    @staticmethod
    def _append_file(sourceFilename, destination):
        """Append the UTF-8 text file *sourceFilename* to *destination*.

        A missing final newline is added so the last line of one file is
        never fused with the first line of the next one.
        """
        lastLine = None
        with codecs.open(sourceFilename, 'r', 'utf-8') as source:
            for line in source:
                destination.write(line)
                lastLine = line
        if lastLine is not None and not lastLine.endswith('\n'):
            destination.write('\n')

    def concatenate_files(self):
        """Merge the per-image .box and .tr files into one pair per font.

        Writes ``<lang>.<font>.box`` and ``<lang>.<font>.tr`` and records
        their names in catBoxFileList / catTrFileList.

        Raises IndexError when a base name has fewer than three
        dot-separated components.
        """
        print('\nConcatenating files')
        self.catBoxFileList = []
        self.catTrFileList = []
        for font in self.fontList:
            filesInFont = []
            for filename in self.baselist:
                parts = filename.split('.')
                if len(parts) < 3:
                    # A bare "lang.font" input would be the very file the
                    # concatenated output is about to overwrite.
                    raise IndexError(
                        'expected a name of the form lang.font.expN, got %r'
                        % filename)
                if parts[1] == font:
                    filesInFont.append(filename)
            catBoxFilename = '%s.%s.box' % (self.lang, font)
            catTrFilename = '%s.%s.tr' % (self.lang, font)
            self.catBoxFileList.append(catBoxFilename)
            self.catTrFileList.append(catTrFilename)
            print(' Concat files: %s %s' % (catBoxFilename, catTrFilename))
            # The with-blocks close (and therefore flush) the outputs
            # before the training tools are pointed at them.
            with codecs.open(catBoxFilename, 'w', 'utf-8') as catBoxFile:
                for filename in filesInFont:
                    self._append_file(filename + '.box', catBoxFile)
            with codecs.open(catTrFilename, 'w', 'utf-8') as catTrFile:
                for filename in filesInFont:
                    self._append_file(filename + '.tr', catTrFile)
            print('Concatenation complete for font: %s' % font)

    def copy_traineddata(self):
        """Copy ``<lang>.traineddata`` into tessdataDirectory.

        An IOError is reported with its actual cause instead of being
        raised.
        """
        traineddata = self.lang + '.traineddata'
        print('\nCopying %s to tessdata directory: %s'
              % (traineddata, self.tessdataDirectory))
        try:
            shutil.copy(traineddata, self.tessdataDirectory)
        except IOError as err:
            print('Error: could not copy %s to the tessdata directory: %s'
                  % (traineddata, err))

    def generate_dawgs(self):
        """Run wordlist2dawg for whichever word lists are present.

        ``<lang>.freq_list.txt`` becomes ``<lang>.freq-dawg`` and
        ``<lang>.word_list.txt`` becomes ``<lang>.word-dawg``.
        """
        print('\nGenerating DAWGs')
        unicharset = self.lang + '.unicharset'
        wordLists = (('freq_list.txt', 'freq-dawg'),
                     ('word_list.txt', 'word-dawg'))
        for listSuffix, dawgSuffix in wordLists:
            listFilename = '%s.%s' % (self.lang, listSuffix)
            if os.path.exists(listFilename):
                dawgFilename = '%s.%s' % (self.lang, dawgSuffix)
                self._call(['wordlist2dawg', listFilename, dawgFilename,
                            unicharset])

    def run(self):
        """Execute the whole training pipeline in the current directory.

        Exits via sys.exit() with a message when no usable image is found.
        """
        self._find_images()
        print(self.baselist)
        if not self.baselist:
            sys.exit('Error: no .tif or .png training images found in %s'
                     % os.getcwd())
        self.get_language()
        self.get_font_list()
        self.generate_tr_files()
        self.concatenate_files()
        self.generate_unicharset()
        self.do_mftraining()
        self.do_cntraining()
        self.rename_files()
        self.generate_dawgs()
        self.combine_data()
        self.copy_traineddata()
if __name__ == "__main__":
    # Script entry point: train from the images in the current directory.
    AutoTrainer().run()