forked from LanguageMachines/PICCL
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.nf
executable file
·290 lines (233 loc) · 10.3 KB
/
ocr.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
*/
//Startup banner
def bannerRule = "--------------------------"
log.info(bannerRule)
log.info("OCR Pipeline")
log.info(bannerRule)

//Process environment; consulted below for a VIRTUAL_ENV entry
def env = System.getenv()

//Set default parameter values
//(virtualenv falls back to the empty string when VIRTUAL_ENV is not set)
params.virtualenv   = env['VIRTUAL_ENV'] ?: ""
params.outputdir    = "ocr_output"
params.inputtype    = "pdf"
params.pdfhandling  = "single"
params.seqdelimiter = "_"
//When --help is given: print the usage text line by line and stop with exit status 2
if (params.containsKey('help')) {
    // NOTE(review): --seqstart is advertised below, but nothing in the visible script reads params.seqstart - confirm
    def usage = [
        "Usage:",
        " ocr.nf [PARAMETERS]",
        "",
        "Mandatory parameters:",
        " --inputdir DIRECTORY Input directory",
        " --language LANGUAGE Language (iso-639-3)",
        "",
        "Optional parameters:",
        " --inputtype STR Specify input type, the following are supported:",
        " pdf (extension *.pdf) - Scanned PDF documents (image content) [default]",
        " tif (\$document_\$sequencenumber.tif) - Images per page (adhere to the naming convention!)",
        " jpg (\$document_\$sequencenumber.jpg) - Images per page",
        " png (\$document_\$sequencenumber.png) - Images per page",
        " gif (\$document_\$sequencenumber.gif) - Images per page",
        " djvu (extension *.djvu)",
        " (The underscore delimiter may optionally be changed using --seqdelimiter)",
        " --outputdir DIRECTORY Output directory (FoLiA documents) [default: " + params.outputdir + "]",
        " --virtualenv PATH Path to Python Virtual Environment to load (usually path to LaMachine)",
        " --pdfhandling reassemble Reassemble/merge all PDFs with the same base name and a number suffix; this can",
        " for instance reassemble a book that has its chapters in different PDFs.",
        " Input PDFs must adhere to a \$document_\$sequencenumber.pdf convention.",
        " (The underscore delimiter may optionally be changed using --seqdelimiter)",
        " --seqdelimiter Sequence delimiter in input files (defaults to: _)",
        " --seqstart What input field is the sequence number (may be a negative number to count from the end), default: -2"
    ]
    usage.each { usageLine -> log.info(usageLine) }
    exit 2
}
//Check mandatory parameters and produce sensible error messages.
//Every failed check logs a message and terminates the run with exit status 2.
if (!params.containsKey('inputdir')) {
    log.info "Error: Missing --inputdir parameter, see --help for usage details"
    //abort here like the other checks do; without this the script would carry on with params.inputdir unset
    exit 2
} else {
    //the input directory must exist before any channel globs inside it
    def dircheck = new File(params.inputdir)
    if (!dircheck.exists()) {
        log.info "Error: Specified input directory does not exist"
        exit 2
    }
}
if (!params.containsKey('language')) {
    log.info "Error: Missing --language parameter, see --help for usage details"
    exit 2
}
//The "pdfimages" input type is accepted as an alias of "pdf" further down in this script, so it is honoured here too
if (((params.inputtype == "pdf") || (params.inputtype == "pdfimages")) && (params.pdfhandling == "reassemble")) {
    // The reassemble option was selected, this means
    // that PDF input filenames should adhere to the
    // $documentname<seqdelimiter>$sequencenumber.pdf convention (seqdelimiter defaults to "_")
    // which we turn into one $documentname.pdf

    //Group the part files in a channel emitting a tuple consisting of a documentname and a list of (unordered) sequence pdf files
    // e.g. the channel emits items such as (documentname, ["documentname_1.pdf", "documentname_2.pdf"] )
    // A file whose base name does not contain the delimiter forms a document of its own, named after its base name.
    Channel.fromPath(params.inputdir+"/**.pdf")
        .map { partfile ->
            partfile.baseName.find(params.seqdelimiter) != null ?
                tuple(partfile.baseName.tokenize(params.seqdelimiter)[0..-2].join(params.seqdelimiter), partfile) :
                tuple(partfile.baseName, partfile)
        }
        .groupTuple()
        .set { pdfparts }

    process reassemble_pdf {
        /*
            Reassemble a PDF 'book' (or whatever) from its parts (e.g, chapters, pages), using pdfunite

            input:  a documentname plus the list of PDF parts belonging to it
            output: a single "<documentname>.pdf"
            exits with status 5 when no PDF part is present in the task directory
        */

        input:
        set val(documentname), file(pdffiles) from pdfparts //consume a documentname and list of pdffiles pertaining to that document

        output:
        file "${documentname}.pdf" into pdfdocuments

        script:
        """
        #!/bin/bash
        target="${documentname}.pdf"

        #collect the parts in *natural* sort order, one array element per file so that names containing spaces stay intact
        parts=()
        while IFS= read -r part; do
            parts+=("\$part")
        done < <(ls -1v -- *.pdf 2>/dev/null)
        count=\${#parts[@]}

        if [ "\$count" -eq 0 ]; then
            echo "No input PDFs to merge!">&2
            exit 5
        elif [ "\$count" -eq 1 ]; then
            #a lone part that already carries the target name needs no copy (cp would fail: source and destination are the same file)
            if [ ! "\${parts[0]}" -ef "\$target" ]; then
                cp "\${parts[0]}" "\$target"
            fi
        else
            #merge into a scratch file first, so that a part which happens to be named like the target is never written to while it is being read
            pdfunite "\${parts[@]}" .reassembled.tmp || exit \$?
            mv -f .reassembled.tmp "\$target"
        fi
        """
    }
}
if (params.inputtype == "djvu") {
//Input channel with every *.djvu file found (recursively) below the input directory; each one is reported as it is emitted
djvudocuments = Channel
    .fromPath(params.inputdir+"/**.djvu")
    .view { djvupath -> "Input document (djvu): " + djvupath }

process djvu {
    /*
        Run ddjvu on one DJVU document to write a TIF image per page,
        named <basename>_<page>.tif
    */

    input:
    file djvufile from djvudocuments

    output:
    set val("${djvufile.baseName}"), file("${djvufile.baseName}*.tif") into djvuimages

    script:
    """
    #!/bin/bash
    ddjvu -format=tiff -eachpage "${djvufile}" "${djvufile.baseName}_%d.tif"
    """
}

//Reshape the (documentname, [imagefiles]) items into a channel of individual (documentname, imagefile) tuples:
//pair the name with each image, flatten everything, then regroup the flat stream two by two
djvuimages
    .collect { documentname, imagefiles -> [[documentname],imagefiles].combinations() }
    .flatten()
    .collate(2)
    .set { pageimages }
} else if ((params.inputtype == "pdf") || (params.inputtype == "pdfimages")) { //2nd condition is needed for backwards compatibility
if (params.pdfhandling == "single") {
    //pdfhandling single means we don't need to reassemble (as done by the prior process), so
    //we can just set up the input channel with the PDFs
    pdfdocuments = Channel.fromPath(params.inputdir+"/**.pdf").view { "Input document (pdf): " + it }
} else if (params.pdfhandling != "reassemble") {
    //in reassemble mode the pdfdocuments channel is expected to come from the reassemble_pdf process;
    //any other value would leave it undefined, so reject it with a clear message instead
    log.error "No such pdfhandling mode: " + params.pdfhandling
    exit 2
}

process pdfimages {
    /*
        Extract images from PDF using pdftoppm

        input:  one PDF document
        output: the document's base name together with the TIF files pdftoppm wrote for it
    */

    input:
    file pdfdocument from pdfdocuments

    output:
    set val("${pdfdocument.baseName}"), file("${pdfdocument.baseName}*.tif") into pdfimages

    script:
    """
    #!/bin/bash
    pdftoppm -tiff "${pdfdocument}" "${pdfdocument.baseName}"
    """
    // Probably better to have sth. like the following?
    //if r != 0:
    //print("pdfimages failed...", file=sys.stderr)
    //sys.exit(r)
}

//Convert (documentname, [imagefiles]) channel to a channel emitting (documentname, imagefile) tuples
pdfimages
    .collect { documentname, imagefiles -> [[documentname],imagefiles].combinations() }
    .flatten()
    .collate(2)
    .set { pageimages }
} else if ((params.inputtype == "jpg") || (params.inputtype == "jpeg") || (params.inputtype == "tif") || (params.inputtype == "tiff") || (params.inputtype == "png") || (params.inputtype == "gif")) {
//Image input: files are named $documentname_$sequencenr.$extension (the sequence number should sort alphabetically);
//Tesseract supports a variety of image formats.
//Each file becomes a (documentname, pagefile) tuple on the pageimages channel. The document name is the base name
//without its last delimiter-separated field; a base name without the delimiter is used unchanged.
Channel
    .fromPath(params.inputdir+"/**." + params.inputtype)
    .map { pagefile ->
        def stem = pagefile.baseName
        def documentname = stem
        if (stem.find(params.seqdelimiter) != null) {
            def fields = stem.tokenize(params.seqdelimiter)
            documentname = fields[0..-2].join(params.seqdelimiter)
        }
        [ documentname, pagefile ]
    }
    .set { pageimages }
} else {
log.error "No such input type: " + params.inputtype
exit 2
}
process tesseract {
    /*
        The OCR step proper: run Tesseract on one page image with hOCR output
        enabled, passing the document name through alongside the resulting
        <page basename>.hocr file
    */

    input:
    set val(documentname), file(page) from pageimages
    val ocrlanguage from params.language

    output:
    set val(documentname), file("${page.baseName}.hocr") into ocrpages

    script:
    """
    tesseract "${page}" "${page.baseName}" -c "tessedit_create_hocr=T" -l "${ocrlanguage}"
    """
}
process ocrpages_to_foliapages {
    /*
        Turn one Tesseract hOCR page into FoLiA by running FoLiA-hocr;
        the result files carry the "FH-" prefix
    */

    //failed tasks are ignored on purpose: not the most elegant solution and a bit dangerous,
    //but sometimes 'empty' hocr files get fed that won't produce a folia file
    errorStrategy 'ignore'

    input:
    set val(documentname), file(hocrpage) from ocrpages
    val venv from params.virtualenv

    //when:
    //hocrpage.text =~ /ocrx_word/

    output:
    set val(documentname), file("FH-${hocrpage.baseName}*.folia.xml") into foliapages //TODO: verify this also works if input is not TIF or PDF?

    script:
    """
    #set up the virtualenv (bit unelegant currently, but we have to do this for each process to ensure the LaMachine environment works)
    set +u
    if [ -n "${venv}" ]; then
        source ${venv}/bin/activate
    fi
    set -u

    FoLiA-hocr --prefix "FH-" -O ./ -t 1 "${hocrpage}"
    """
}
//Gather the pages of each document:
//(documentname, foliapage) items become (documentname, [foliapages]), with the pages
//of a group ordered by their file name (the bare name, not the full path)
foliapages
    .groupTuple(sort: { foliapage -> file(foliapage).getName() })
    .set { groupfoliapages }
process foliacat {
    /*
        Join the separate FoLiA pages of one document back into a single
        <documentname>.ocr.folia.xml, which is published to the output directory
    */

    //this is the final output, so copy it to where the end-user will look for it
    publishDir params.outputdir, mode: 'copy', overwrite: true

    input:
    set val(documentname), file("*.tif.folia.xml") from groupfoliapages
    val venv from params.virtualenv

    output:
    file "${documentname}.ocr.folia.xml" into foliaoutput

    script:
    """
    #!/bin/bash
    set +u
    if [ -n "${venv}" ]; then
        source ${venv}/bin/activate
    fi
    set -u

    if [ ! -f .tif.folia.xml ]; then
        #several pages: concatenate them in natural sort order
        foliainput=\$(ls -1v *.tif.folia.xml | tr '\\n' ' ')
        foliacat -i "${documentname}" -o "${documentname}.ocr.folia.xml" \$foliainput
    else
        #only one file, nothing to cat
        cp .tif.folia.xml "${documentname}.ocr.folia.xml"
    fi
    """
}
//tell the user on stdout about every final document that was produced
foliaoutput.subscribe { foliadoc ->
    println "OCR output document written to " + params.outputdir + "/" + foliadoc.name
}