-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr.py
59 lines (48 loc) · 1.91 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import argparse
from tqdm import tqdm
from subprocess import Popen, PIPE
# Root of the processing backlog: a UNC share on Windows, a mounted path
# on every other platform.
processingDir = (
    "\\\\Lincoln\\Library\\SPE_Processing\\backlog"
    if os.name == 'nt'
    else "/media/Library/SPE_Processing/backlog"
)

# Command line: one required package name plus an optional subpath that
# narrows the run to part of the derivatives directory.
parser = argparse.ArgumentParser()
parser.add_argument(
    "package",
    help="ID for package you are processing, i.e. 'ua950.012_Xf5xzeim7n4yE6tjKKHqLM'.",
)
parser.add_argument(
    "-p",
    "--path",
    help="Subpath, relative to derivatives directory which will only convert files there.",
    default=None,
)
args = parser.parse_args()

# A package name must contain "_" or "-"; anything else is rejected.
if "_" not in args.package and "-" not in args.package:
    raise Exception("ERROR: " + str(args.package) + " is not a valid processing package.")
# ID is the text before the first "_", or before the first "-" when the
# name has no underscore.
ID = args.package.split("_" if "_" in args.package else "-")[0]

# Package layout: <processingDir>/<ID>/<package>/{masters,derivatives,metadata}
package = os.path.join(processingDir, ID, args.package)
masters, derivatives, metadata = (
    os.path.join(package, subfolder)
    for subfolder in ("masters", "derivatives", "metadata")
)
def process(cmd):
    """Run an external command and echo what it wrote to stdout and stderr.

    Args:
        cmd: Argument list handed to ``Popen`` without a shell, so
            arguments containing spaces need no extra quoting.

    Returns:
        The exit status of the child process (0 means success).
    """
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    for captured in (stdout, stderr):
        if captured:
            # communicate() returns bytes here; decode so the output is
            # printed as readable text rather than a b'...' repr with
            # escaped newlines. Undecodable bytes are replaced, not fatal.
            print(captured.decode("utf-8", errors="replace"))
    # Hand the exit status back so callers can detect a failed run.
    return p.returncode
# Choose the directory to OCR: the whole derivatives folder by default, or
# only the subpath given with -p/--path.
if args.path:
    ocrPath = os.path.join(derivatives, os.path.normpath(args.path))
    # Validate the resolved subpath itself. Checking only `derivatives`
    # let a mistyped --path through, after which os.walk() on the missing
    # directory silently processed nothing.
    if not os.path.isdir(ocrPath):
        raise Exception("ERROR: subpath " + args.path + " relative to derivatives is not a valid path.")
else:
    ocrPath = derivatives
# Walk everything below ocrPath and run ocrmypdf on each PDF found.
for folder, _subdirs, names in os.walk(ocrPath):
    # One progress bar per directory, counting every file (not only PDFs).
    for name in tqdm(names):
        if not name.lower().endswith(".pdf"):
            continue
        target = os.path.join(folder, name)
        print ("processing " + name + "...")
        # The same path is passed twice, as ocrmypdf's input and output,
        # so the PDF is replaced in place. Only --deskew is enabled; an
        # earlier variant that also passed --clean is not used.
        process(["ocrmypdf", "--deskew", target, target])