-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctionsXML.py
171 lines (150 loc) · 5.91 KB
/
functionsXML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
###############################################################################
#
# Author: Erik Tjong Kim Sang
# Project: REE-HDSC research project of the Radboud University Nijmegen and
# the Netherlands eScience Center
# Link: https://github.com/ree-hdsc/ree-hdsc
#
# Revised by: Lisa Hoek
# Thesis: Extracting Entities from Handwritten Civil Records using
# Handwritten Text Recognition and Regular Expressions
#
###############################################################################
import ast, json, os, re
import xml.etree.ElementTree as ET
def get_text_from_file(file_name):
    """Parse an XML file and return its text, metadata and text regions.

    Args:
        file_name: Path of the XML file handed to ``ET.parse``.

    Returns:
        A ``(text, metadata, textregions)`` tuple, where ``text`` and
        ``metadata`` come from ``get_text_from_xml`` and ``textregions``
        from ``get_textregions_from_xml``, both applied to the document root.
    """
    document_root = ET.parse(file_name).getroot()
    text, metadata = get_text_from_xml(document_root)
    return text, metadata, get_textregions_from_xml(document_root)
def get_value_string(fields):
    """Consume tokens from ``fields`` until the joined value ends with ``}``.

    The first token is always consumed. Further tokens are appended (without
    a separator) until the accumulated string ends with a closing brace or
    the token list runs out.

    Args:
        fields: List of string tokens; consumed tokens are removed from the
            front of this list in place.

    Returns:
        The concatenation of the consumed tokens.
    """
    collected = fields.pop(0)
    while True:
        # Stop when nothing is left to read or the value is closed by "}".
        if not fields or re.search("}$", collected):
            return collected
        collected = collected + fields.pop(0)
def string_to_dict(string):
    """Turn a ``{key:value;key:value;}`` string into a dictionary.

    A leading ``{`` and a trailing ``;}`` are stripped, the remainder is
    split on ``;`` into pairs and each pair is split on ``:``.  Keys and
    values are kept as strings.

    Args:
        string: The brace-delimited text to convert.

    Returns:
        A dict mapping each pair's first component to its second component.

    Raises:
        IndexError: If a pair contains no ``:``.
    """
    stripped = re.sub(";}$", "", re.sub("^{", "", string))
    split_pairs = (item.split(":") for item in stripped.split(";"))
    return {parts[0]: parts[1] for parts in split_pairs}
def process_custom_attrib(custom_line):
    """Parse a whitespace-separated ``key {..}`` line into a dict of lists.

    Tokens are read as alternating keys and values.  Each value is assembled
    by ``get_value_string`` and converted with ``string_to_dict``; values
    sharing a key are collected in a list in order of appearance.  A key
    that is the very last token (no value follows) is stored with an empty
    list.

    Args:
        custom_line: The raw text to parse.

    Returns:
        A dict mapping each key to a list of parsed value dicts.
    """
    remaining = custom_line.split()
    data = {}
    while remaining:
        key = remaining.pop(0)
        if remaining:
            parsed_value = string_to_dict(get_value_string(remaining))
            data.setdefault(key, []).append(parsed_value)
        else:
            # Trailing key without a value: record it with no entries.
            data[key] = []
    return data
def process_textline_attrib(attribs):
    """Return the parsed ``custom`` attribute of a text line.

    Args:
        attribs: Mapping of attribute names to values (for example the
            ``attrib`` dict of an XML element).

    Returns:
        The result of ``process_custom_attrib`` on the ``custom`` value, or
        an empty dict when there is no ``custom`` attribute.  An empty dict
        (rather than ``None``) lets callers iterate the result without a
        special case and matches what ``make_custom_dict`` returns for the
        same situation.
    """
    if "custom" not in attribs:
        return {}
    return process_custom_attrib(attribs["custom"])
def add_length_to_offset(metadata_value, text_length):
    """Shift the ``offset`` entry of a metadata dict by ``text_length``.

    The stored offset is converted to ``int`` before the addition.  The dict
    is modified in place; dicts without an ``offset`` key are left as is.

    Args:
        metadata_value: Dict that may contain an ``offset`` entry.
        text_length: Amount to add to the offset.

    Returns:
        The same ``metadata_value`` object that was passed in.
    """
    if "offset" in metadata_value:
        shifted = int(metadata_value["offset"]) + text_length
        metadata_value["offset"] = shifted
    return metadata_value
def expand_metadata(metadata_base, metadata_new, text_length):
    """Merge ``metadata_new`` into ``metadata_base`` in place.

    Every value of ``metadata_new`` is passed through
    ``add_length_to_offset`` with ``text_length`` and appended to the list
    stored under the same key in ``metadata_base``; missing keys are created
    with an empty list first.

    Args:
        metadata_base: Dict of lists that receives the new values.
        metadata_new: Dict of lists holding the values to add.
        text_length: Amount by which the offsets of the new values shift.
    """
    for key, new_values in metadata_new.items():
        target = metadata_base.setdefault(key, [])
        for value in new_values:
            target.append(add_length_to_offset(value, text_length))
def get_text_from_xml(root):
    """Collect the text and metadata of all ``TextLine`` elements.

    For each ``TextLine`` (in any namespace) the parsed ``custom`` attribute
    is merged into the metadata, with offsets shifted by the length of the
    text gathered so far.  The content of each ``TextEquiv/Unicode`` child is
    then appended, after ``remove_strikethroughs`` has been applied, followed
    by a newline.  ``Unicode`` elements without text are skipped.

    Args:
        root: Root element of the parsed XML document.

    Returns:
        A ``(text, metadata)`` tuple.
    """
    collected_text = ""
    metadata = {}
    for line_element in root.findall(".//{*}TextLine"):
        line_attributes = line_element.attrib
        expand_metadata(
            metadata,
            process_textline_attrib(line_attributes),
            len(collected_text),
        )
        style_info = make_custom_dict(line_attributes)
        for unicode_element in line_element.findall("./{*}TextEquiv/{*}Unicode"):
            if unicode_element.text is None:
                continue
            cleaned = remove_strikethroughs(unicode_element.text, style_info)
            collected_text += cleaned + "\n"
    return collected_text, metadata
def make_custom_dict(text_line_attributes):
    """Parse the ``custom`` attribute of a text line into a dict of lists.

    The attribute text is read as alternating keys and brace-delimited
    values.  Value tokens are re-joined with single spaces until one ends
    with ``}``; the value is then quoted by ``json_string_add_quotes`` and
    evaluated with ``ast.literal_eval`` (which only accepts Python
    literals).  Values sharing a key are collected in order of appearance.

    Args:
        text_line_attributes: Mapping of attribute names to values.

    Returns:
        A dict mapping each key to a list of parsed values.  The dict is
        empty when there is no ``custom`` attribute.  A key that is the last
        token, with no value following it, is kept with whatever values it
        already had (an empty list if none) instead of raising
        ``IndexError``, so that such input is accepted here just as it is
        by ``process_custom_attrib``.
    """
    if "custom" not in text_line_attributes:
        return {}
    tokens = text_line_attributes["custom"].split()
    token_count = len(tokens)
    custom_dict = {}
    position = 0
    # Walk the token list by index; popping from the front of a list is O(n).
    while position < token_count:
        custom_key = tokens[position]
        position += 1
        if position >= token_count:
            # Dangling key without a value.
            custom_dict.setdefault(custom_key, [])
            break
        custom_value = tokens[position]
        position += 1
        # Tokens never contain whitespace, so endswith is a safe end test.
        while position < token_count and not custom_value.endswith("}"):
            custom_value += " " + tokens[position]
            position += 1
        parsed_value = ast.literal_eval(json_string_add_quotes(custom_value))
        custom_dict.setdefault(custom_key, []).append(parsed_value)
    return custom_dict
def remove_strikethroughs(text_line, custom_dict):
    """Blank out the struck-through parts of a line of text.

    Every entry of ``custom_dict["textStyle"]`` that has a ``strikethrough``
    key marks a span, given by its ``offset`` and ``length`` entries, whose
    characters are replaced by spaces so the line keeps its length.

    Args:
        text_line: The text of the line.
        custom_dict: Dict as produced by ``make_custom_dict``.

    Returns:
        ``text_line`` with the marked spans replaced by spaces, or
        ``text_line`` unchanged when there is no ``textStyle`` entry.  Spans
        are clamped to the bounds of the line, so a span that runs past the
        end (or starts before the beginning) blanks only the part that
        actually lies inside the text instead of raising ``IndexError`` or
        wrapping around to the end of the line.
    """
    if "textStyle" not in custom_dict:
        return text_line
    chars = list(text_line)
    line_length = len(chars)
    for style in custom_dict["textStyle"]:
        if "strikethrough" not in style:
            continue
        offset = int(style["offset"])
        start = max(0, offset)
        stop = min(line_length, offset + int(style["length"]))
        if stop > start:
            chars[start:stop] = [" "] * (stop - start)
    return "".join(chars)
def json_string_add_quotes(string):
    """Add single quotes around the keys and values of a ``{k:v; ...}`` text.

    A fixed sequence of regular-expression substitutions is applied, each one
    working on the result of the previous one, turning for example
    ``{index:0;}`` into ``{ 'index': '0' }``.

    Args:
        string: The brace-delimited text to rewrite.

    Returns:
        The rewritten text.
    """
    # Order matters: each rule relies on what the earlier rules produced.
    substitutions = (
        ("; *}", "' }"),
        ("} *'", "} "),
        ("; *", "', '"),
        (": *", "': '"),
        ("{ *", "{ '"),
    )
    quoted = string
    for pattern, replacement in substitutions:
        quoted = re.sub(pattern, replacement, quoted)
    return quoted
def convert_to_lists_coords(coords):
    """Split an ``"x,y x,y ..."`` string into lists of x and y integers.

    Args:
        coords: Whitespace-separated points, each written as ``x,y``.

    Returns:
        A ``(x_coords, y_coords)`` tuple of integer lists, in input order.

    Raises:
        ValueError: If a point does not consist of exactly two
            comma-separated integers.
    """
    points = [point.split(",") for point in coords.split()]
    x_coords = [int(x_text) for x_text, _ in points]
    y_coords = [int(y_text) for _, y_text in points]
    return x_coords, y_coords
def get_extreme_points_coords(coords):
    """Return the bounding values of a list of points.

    Args:
        coords: Whitespace-separated points, each written as ``x,y``.

    Returns:
        A ``(min_x, max_x, min_y, max_y)`` tuple.  When ``coords`` holds no
        points at all (empty or whitespace only) the result is
        ``(0, 0, 0, 0)``; checking for whitespace as well avoids calling
        ``min`` on an empty list.
    """
    if not coords.strip():
        return 0, 0, 0, 0
    x_coords, y_coords = convert_to_lists_coords(coords)
    return min(x_coords), max(x_coords), min(y_coords), max(y_coords)
def get_textregions_from_xml(root):
    """Return the bounding values of every text region in the document.

    For each ``Coords`` child of each ``TextRegion`` element (in any
    namespace) the ``points`` attribute is reduced to its extreme values with
    ``get_extreme_points_coords``.

    Args:
        root: Root element of the parsed XML document.

    Returns:
        A list with one result of ``get_extreme_points_coords`` per
        ``Coords`` element, in document order.
    """
    return [
        get_extreme_points_coords(coords_element.attrib["points"])
        for region in root.findall(".//{*}TextRegion")
        for coords_element in region.findall("./{*}Coords")
    ]
def make_file_id(file_name):
    """Build a ``year-district-folio`` identifier from a file name.

    The file name is split on whitespace: the second token is used as the
    year, the last token (with everything from its first ``.`` removed) as
    the folio number, and the tokens in between, joined together and with
    the words matched by ``(Buiten|distr.|Stad)`` removed, as the district.
    An empty district is replaced by ``"1e"``.

    Args:
        file_name: The file name to convert.

    Returns:
        The identifier, or ``file_name`` itself when it cannot be split into
        the expected parts.
    """
    try:
        tokens = file_name.split()
        year = tokens[1]
        folio_nbr = re.sub(r"\..*$", "", tokens[-1])
        district = re.sub("(Buiten|distr.|Stad)", "", "".join(tokens[2:-1]))
    except (AttributeError, IndexError, TypeError):
        # Too few tokens, or not a string at all: hand the input back.  Only
        # these errors are caught so that KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return file_name
    return "-".join([year, district or "1e", folio_nbr])
def print_with_color(string, color_code=1):
    """Print ``string`` wrapped in ANSI color escape sequences.

    The text is preceded by ``ESC[3<color_code>m`` and followed by
    ``ESC[m``.  No newline is added.

    Args:
        string: The text to print.
        color_code: Digit inserted into the opening escape sequence.
    """
    colored = f"\x1b[3{color_code}m{string}\x1b[m"
    print(colored, end="")
def read_files(data_dir):
    """Read every ``.xml`` file in a directory.

    Each file is processed with ``get_text_from_file``.  Results are stored
    under the file name without its last four characters (the ``.xml``
    extension).  A file that cannot be processed is reported with
    ``print_with_color`` and skipped, so one bad file does not stop the run.

    Args:
        data_dir: Directory whose entries are listed with ``os.listdir``.

    Returns:
        A ``(texts, metadata, textregions)`` tuple of dicts keyed by file id.
    """
    texts, metadata, textregions = {}, {}, {}
    for file_name in os.listdir(data_dir):
        if not re.search(r"\.xml$", file_name):
            continue
        file_id = file_name[:-4]
        try:
            result = get_text_from_file(os.path.join(data_dir, file_name))
        except Exception:
            # Best effort: report and move on.  Catching Exception (not a
            # bare except) keeps Ctrl-C and SystemExit working.
            print_with_color(f"error processing file {file_id}\n")
            continue
        texts[file_id], metadata[file_id], textregions[file_id] = result
    return texts, metadata, textregions