-
Notifications
You must be signed in to change notification settings - Fork 0
/
logs.py
108 lines (81 loc) · 3.37 KB
/
logs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import argparse
import os
import re
import sys
import pytesseract
from PIL import Image
# Substrings that mark a call line as a non-emergency nature. When the
# --exclude flag is given, any line containing one of these (compared
# case-insensitively) is dropped. Entries are unique; order is not significant.
EXCLUDE_KEYWORDS = [
    'amr', 'lift', 'animal', 'alarm', 'structure', 'medical', 'persons',
    'parking', 'dog', 'accident', 'assist', 'paper', 'barking', '911',
    'repossession', 'runaway', 'odor', 'ambulance', 'service',
    'vehicle', 'traffic', 'trash', 'info', 'repo', 'dav', 'street',
    'wires', 'utility', 'fireworks', 'water',
]

# A call line must begin with a 9-digit call ID; OCR lines that do not are
# ignored. The matched text is the call ID itself.
ID_RE = re.compile(r'^\d{9}')
def main():
    """Command-line entry point: OCR call-log images and print matching calls.

    Positional arguments:
        directory      Directory containing the ``.jpg``/``.jpeg`` images.
        filter_nature  Optional comma-separated list of substrings; when given,
                       only lines containing at least one of them are kept.

    Options:
        --exclude  Drop lines containing any of ``EXCLUDE_KEYWORDS``.
        --lines    Output full call lines (this is already the default).
        --ids      Output a comma-separated list of call IDs instead.

    Progress and results are written to stdout, errors to stderr. Exits with
    status 1 when the directory cannot be read, when it contains no images,
    or when no line survives the filtering.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('directory', type=str,
                        help='Directory containing images to parse')
    parser.add_argument('filter_nature', type=str,
                        help='Comma separated list of natures to filter by',
                        nargs='?', default=None)
    parser.add_argument('--exclude', dest='exclude_lines', action='store_true',
                        default=False, help='Exclude non emergency nature')
    # NOTE: store_true with default=True means this flag never changes
    # anything; it is kept so existing command lines continue to parse.
    parser.add_argument('--lines', dest='output_lines', action='store_true',
                        default=True, help='Output call lines only')
    parser.add_argument('--ids', dest='output_ids', action='store_true',
                        default=False, help='Output ids only')
    args = parser.parse_args()

    nature_filters = args.filter_nature
    if nature_filters:
        # Discard empty entries (e.g. from a trailing or doubled comma): an
        # empty string is a substring of every line and would silently turn
        # the filter into a no-op.
        nature_filters = [
            part.strip() for part in nature_filters.split(',') if part.strip()
        ]

    try:
        pages = _ocr_pages(args.directory)
    except (FileNotFoundError, NotADirectoryError):
        sys.stderr.write('Directory not found.\n')
        sys.exit(1)

    if not pages:
        sys.stderr.write('No images found.\n')
        sys.exit(1)

    sys.stdout.write('\n')
    sys.stdout.write('{} pages processed.\n'.format(len(pages)))

    clean_lines = _select_calls(pages, args.exclude_lines, nature_filters)
    if not clean_lines:
        sys.stderr.write('No results.\n')
        sys.exit(1)

    sys.stdout.write('{} lines found.\n\n'.format(len(clean_lines)))

    if args.output_ids:
        # Comma separated list of IDs only
        id_list = ', '.join(call_id for call_id, _ in clean_lines)
        sys.stdout.write('{}\n'.format(id_list))
    else:
        # Full lines
        for _, line in clean_lines:
            sys.stdout.write('{}\n'.format(line))


def _ocr_pages(directory):
    """Return one list of text lines per JPEG image found in *directory*.

    Files are processed in sorted name order so the output is reproducible
    (``os.listdir`` makes no ordering guarantee). A ``.`` is written to
    stdout for each image as a progress indicator.
    """
    pages = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(('.jpeg', '.jpg')):
            continue
        sys.stdout.write('.')
        sys.stdout.flush()
        filepath = os.path.join(directory, filename)
        # Context manager closes the underlying file once OCR is done.
        with Image.open(filepath) as image:
            text = pytesseract.image_to_string(image, config='--psm 6')
        pages.append(text.split('\n'))
    return pages


def _select_calls(pages, exclude_lines, nature_filters):
    """Return ``(call_id, line)`` tuples for the lines that pass all filters.

    A line is kept only if it starts with a 9-digit call ID, does not contain
    an excluded keyword (when *exclude_lines* is true) and contains at least
    one of *nature_filters* (when any are given). Matching is
    case-insensitive.
    """
    # Lowercase the keyword lists once instead of once per line.
    excluded = [word.lower() for word in EXCLUDE_KEYWORDS] if exclude_lines else []
    wanted = [word.lower() for word in nature_filters or []]

    selected = []
    for page in pages:
        for line in page:
            # Skip any lines that don't start with a 9 digit call ID
            id_match = ID_RE.match(line)
            if not id_match:
                continue
            # Exclusion keywords are checked against the text after the call
            # ID only; otherwise a numeric keyword such as '911' could match
            # digits inside the ID and drop an unrelated call.
            details = line[id_match.end():].lower()
            if any(word in details for word in excluded):
                continue
            # User filters still search the whole line, so existing
            # invocations keep matching exactly what they matched before.
            if wanted:
                lowered = line.lower()
                if not any(word in lowered for word in wanted):
                    continue
            selected.append((id_match.group(), line))
    return selected
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()