-
Notifications
You must be signed in to change notification settings - Fork 0
/
coordinate_extraction.py
328 lines (262 loc) · 14 KB
/
coordinate_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
"""This module contains the CoordinateExtractor class."""
from __future__ import annotations
import abc
import logging
from dataclasses import dataclass
import fitz
import regex
from stratigraphy.util.extract_text import extract_text_lines
from stratigraphy.util.line import TextLine
from stratigraphy.util.util import read_params
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Pattern for a single coordinate value: an optional leading "1" or "2" (captured), followed by two captured
# groups of three digits each, optionally followed by a period and/or one more digit (not captured). Up to two
# separator characters (period, whitespace or one of the single-quote variants) may appear between the groups.
COORDINATE_ENTRY_REGEX = r"(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})\.?\d?"
@dataclass
class CoordinateEntry:
    """A single coordinate component, stored as an integer value."""

    coordinate_value: int

    def __repr__(self):
        """Render the value with apostrophes as thousands separators.

        Values that are not greater than 100'000 are zero-padded to a minimum width of seven characters, so
        that LV03 values with a leading zero keep that zero in the output.
        """
        # Width 7 = six digits plus one separator; larger values are printed without padding.
        format_spec = "," if self.coordinate_value > 1e5 else "07,"
        return format(self.coordinate_value, format_spec).replace(",", "'")
@dataclass
class Coordinate(metaclass=abc.ABCMeta):
    """Abstract base for an east / north coordinate pair together with its location in the document.

    Subclasses must implement ``is_valid``.
    """

    east: CoordinateEntry
    north: CoordinateEntry
    rect: fitz.Rect
    page: int

    def __post_init__(self):
        """Make sure the larger of the two values is stored as ``east``."""
        # East is always greater than north by definition, irrespective of the leading 1 or 2.
        if self.north.coordinate_value > self.east.coordinate_value:
            logger.info("Swapping coordinates.")
            self.east, self.north = self.north, self.east

    def __str__(self):
        """Return the pair in the form ``E: <east>, N: <north>``."""
        return "E: {}, N: {}".format(self.east, self.north)

    def to_json(self):
        """Return a dictionary with the raw east / north values, the rect corners and the page."""
        box = self.rect
        return {
            "E": self.east.coordinate_value,
            "N": self.north.coordinate_value,
            "rect": [box.x0, box.y0, box.x1, box.y1],
            "page": self.page,
        }

    @abc.abstractmethod
    def is_valid(self):
        """Tell whether the coordinate is acceptable; implemented by the concrete subclasses."""

    @staticmethod
    def from_values(east: int, north: int, rect: fitz.Rect, page: int) -> Coordinate | None:
        """Build the coordinate object matching the magnitude of ``east``.

        Args:
            east (int): the east value
            north (int): the north value
            rect (fitz.Rect): the rectangle associated with the coordinates
            page (int): the page associated with the coordinates

        Returns:
            Coordinate | None: an LV95Coordinate when ``east`` lies strictly between 1e6 and 1e7, an
                LV03Coordinate when ``east`` is below 1e6, and None (after logging a warning) otherwise.
        """
        if 1e6 < east < 1e7:
            coordinate_class = LV95Coordinate
        elif east < 1e6:
            coordinate_class = LV03Coordinate
        else:
            logger.warning(f"Invalid coordinates format. Got E: {east}, N: {north}")
            return None

        east_entry = CoordinateEntry(coordinate_value=east)
        north_entry = CoordinateEntry(coordinate_value=north)
        return coordinate_class(east_entry, north_entry, rect, page)

    @staticmethod
    def from_json(input: dict):
        """Rebuild a coordinate from a dictionary with the keys ``E``, ``N``, ``rect`` and ``page``.

        The result is whatever ``from_values`` returns for those values.
        """
        return Coordinate.from_values(
            east=input["E"],
            north=input["N"],
            rect=fitz.Rect(input["rect"]),
            page=input["page"],
        )
@dataclass
class LV95Coordinate(Coordinate):
    """Coordinate given in the LV95 format."""

    def is_valid(self):
        """Check that east and north both lie strictly inside the accepted LV95 ranges.

        Reference: https://de.wikipedia.org/wiki/Schweizer_Landeskoordinaten#Beispielkoordinaten.
        """
        if not 2324800 < self.east.coordinate_value < 2847500:
            return False
        return 1074000 < self.north.coordinate_value < 1302000
@dataclass
class LV03Coordinate(Coordinate):
    """Coordinate given in the LV03 format."""

    def is_valid(self):
        """Check that east and north both lie strictly inside the accepted LV03 ranges.

        Reference: https://de.wikipedia.org/wiki/Schweizer_Landeskoordinaten#Beispielkoordinaten.

        To account for uncertainties in the conversion of LV03 to LV95, a margin of 2 is allowed on each bound.
        """
        if not 324798 < self.east.coordinate_value < 847502:
            return False
        return 73998 < self.north.coordinate_value < 302002
class CoordinateExtractor:
    """Extracts coordinates from a PDF document."""

    def __init__(self, document: fitz.Document):
        """Initializes the CoordinateExtractor object.

        Args:
            document (fitz.Document): A PDF document.
        """
        self.doc = document
        # Keys used by find_coordinate_key to locate a coordinate label in the text.
        self.coordinate_keys = read_params("matching_params.yml")["coordinate_keys"]

    def get_coordinates_with_x_y_labels(self, lines: list[TextLine], page: int) -> list[Coordinate]:
        """Find coordinates with explicit "X" and "Y" labels from the text lines.

        Args:
            lines (list[TextLine]): all the lines of text to search in
            page (int): the page number (1-based) of the PDF document

        Returns:
            list[Coordinate]: all found coordinates
        """
        # In this case, we can allow for some whitespace in between the numbers.
        # In some older borehole profile the OCR may recognize whitespace between two digits.
        pattern_x = regex.compile(r"X[=:\s]{0,3}" + COORDINATE_ENTRY_REGEX, flags=regex.IGNORECASE)
        pattern_y = regex.compile(r"Y[=:\s]{0,3}" + COORDINATE_ENTRY_REGEX, flags=regex.IGNORECASE)
        x_matches = CoordinateExtractor._match_text_with_rect(lines, pattern_x)
        y_matches = CoordinateExtractor._match_text_with_rect(lines, pattern_y)

        # We are only checking the 1st x-value with the 1st y-value, the 2nd x-value with the 2nd y-value, etc.
        # In some edge cases, the matched x_values and y-values might not be aligned / equal in number. However,
        # we ignore this for now, as almost always, the 1st x and y values are already the ones that we are looking
        # for.
        found_coordinates = []
        for (x_match, x_rect), (y_match, y_rect) in zip(x_matches, y_matches, strict=False):
            # The reported rect is the union of the rects of the X match and the Y match.
            rect = fitz.Rect()
            rect.include_rect(x_rect)
            rect.include_rect(y_rect)
            coordinates = Coordinate.from_values(
                east=int("".join(x_match.groups(default=""))),
                north=int("".join(y_match.groups(default=""))),
                rect=rect,
                page=page,
            )
            if coordinates is not None and coordinates.is_valid():
                found_coordinates.append(coordinates)
        return found_coordinates

    def find_coordinate_key(self, lines: list[TextLine], allowed_errors: int = 3) -> TextLine | None:  # noqa: E501
        """Finds the location of a coordinate key in a string of text.

        This is useful to reduce the text within which the coordinates are searched. If the text is too large
        false positive (found coordinates that are no coordinates) are more likely.

        The function allows for a certain number of errors in the key. Errors are defined as insertions, deletions
        or substitutions of characters (i.e. Levenshtein distance). For more information of how errors are defined see
        https://github.com/mrabarnett/mrab-regex?tab=readme-ov-file#approximate-fuzzy-matching-hg-issue-12-hg-issue-41-hg-issue-109.

        Args:
            lines (list[TextLine]): Arbitrary text lines to search in.
            allowed_errors (int, optional): Exclusive upper bound on the number of errors (Levenshtein distance)
                for a key to be considered contained in the text. The pattern is built with ``{e<allowed_errors}``,
                so a match contains strictly fewer than ``allowed_errors`` errors. Defaults to 3 (guestimation;
                no optimisation done yet).

        Returns:
            TextLine | None: The line of the coordinate key found in the text. When several lines match, the one
                with the fewest errors is returned (the first such match in case of a tie).
        """
        matches = []
        for key in self.coordinate_keys:
            pattern = regex.compile(r"\b(" + key + "){e<" + str(allowed_errors) + r"}\b", flags=regex.IGNORECASE)
            for line in lines:
                match = pattern.search(line.text)
                if match:
                    # fuzzy_counts holds the number of errors per error type; their sum is the total error count.
                    matches.append((line, sum(match.fuzzy_counts)))

        # if no match was found, return None
        if not matches:
            return None

        best_line, _ = min(matches, key=lambda x: x[1])
        return best_line

    def get_coordinates_near_key(self, lines: list[TextLine], page: int, page_width: float) -> list[Coordinate]:
        """Find coordinates from text lines that are close to an explicit "coordinates" label.

        Also apply some preprocessing to the text of those text lines, to deal with some common (OCR) errors.

        Args:
            lines (list[TextLine]): all the lines of text to search in
            page (int): the page number (1-based) of the PDF document
            page_width (float): the width of the current page (in points / PyMuPDF coordinates)

        Returns:
            list[Coordinate]: all found coordinates
        """
        # find the key that indicates the coordinate information
        coordinate_key_line = self.find_coordinate_key(lines)
        if coordinate_key_line is None:
            return []

        # find the lines of the text that are close to an identified coordinate key.
        key_rect = coordinate_key_line.rect
        # look for coordinate values to the right and/or immediately below the key
        coordinate_search_rect = fitz.Rect(key_rect.x0, key_rect.y0, page_width, key_rect.y1 + 3 * key_rect.height)
        coord_lines = [line for line in lines if line.rect.intersects(coordinate_search_rect)]

        # Single-character substitutions applied in one pass; none of the replacement characters is itself a key
        # of the table, so this is equivalent to applying the replacements one after the other.
        ocr_fixes = str.maketrans(
            {
                ",": ".",
                "'": ".",
                "o": "0",  # frequent ocr error
                "\n": " ",
            }
        )

        def preprocess(value: str) -> str:
            return value.translate(ocr_fixes)

        return self.get_coordinates_from_lines(coord_lines, page, preprocess)

    @staticmethod
    def get_coordinates_from_lines(lines: list[TextLine], page: int, preprocess=lambda x: x) -> list[Coordinate]:
        r"""Matches the coordinates in a string of text.

        The query searches for a pair of coordinates of 6 or 7 digits, respectively. The pair of coordinates
        must at most be separated by 4 characters. The regular expression is designed to match a wide range of
        coordinate formats for the Swiss coordinate systems LV03 and LV95, including 'X=123.456 Y=123.456',
        'X:123.456, Y:123.456', 'X 123 456 Y 123 456', whereby the X and Y are optional.

        The full regular expressions query is:
        "(?:[XY][=:\s]{0,2})?(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})\.?\d?.{0,4}?[XY]?[=:\s]{0,2}(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})\.?\d?"

        Query explanation:
        - (?:[XY][=:\s]{0,2})?: This matches an optional 'X' or 'Y' label followed by zero to two occurrences of
          either an equals sign, a colon, or a whitespace character. The label and its separators are optional as
          a whole.
        - (?:([12])[\.\s'‘’]{0,2})?: This is a non-capturing group (indicated by ?:), which means it groups the
          enclosed characters but does not create a backreference. It matches an optional '1' or '2' (which is
          captured in a group) followed by zero to two occurrences of a period, space, or single quote.
        - (\d{3}): This matches (and captures) exactly three digits.
        - [\.\s'‘’]{0,2}: This matches zero to two occurrences of a period, space, or single quote.
        - (\d{3}): This again matches (and captures) exactly three digits.
        - \.?\d?: This matches an optional period followed by an optional digit.
        - .{0,4}?: This matches up to four occurrences of any characters, except newline (as few as possible).
        - [XY]?[=:\s]{0,2}: This matches an optional 'X' or 'Y' and zero to two separator characters in front of
          the second coordinate value.

        The second half of the regular expression repeats the coordinate pattern, allowing it to match a pair of
        coordinates in the format 'X=123.456 Y=123.456', with some flexibility for variations in the format. For
        example, it can also match 'X:123.456, Y:123.456', 'X 123 456 Y 123 456', and so on.

        Args:
            lines (list[TextLine]): Text lines to search in.
            page (int): the page number (1-based) of the PDF document
            preprocess: function that takes a string and returns a preprocessed string; it is applied to the text
                of every line before matching. Defaults to the identity function.  # TODO add type

        Returns:
            list[Coordinate]: A list of potential coordinates
        """
        full_regex = regex.compile(
            r"(?:[XY][=:\s]{0,2})?" + COORDINATE_ENTRY_REGEX + r".{0,4}?[XY]?[=:\s]{0,2}" + COORDINATE_ENTRY_REGEX
        )

        found_coordinates = []
        for match, rect in CoordinateExtractor._match_text_with_rect(lines, full_regex, preprocess):
            # Six capture groups in total: the first three belong to the first value, the last three to the second.
            groups = match.groups(default="")
            coordinates = Coordinate.from_values(
                east=int("".join(groups[:3])),
                north=int("".join(groups[3:])),
                rect=rect,
                page=page,
            )
            if coordinates is not None and coordinates.is_valid():
                found_coordinates.append(coordinates)
        return found_coordinates

    @staticmethod
    def _match_text_with_rect(
        lines: list[TextLine], pattern: regex.Regex, preprocess=lambda x: x
    ) -> list[tuple[regex.Match, fitz.Rect]]:
        """Search the concatenated text of all lines and attach a bounding rect to every match.

        The (preprocessed) texts of the lines are joined with a single space after each line, so that a match
        may span several lines.

        Args:
            lines (list[TextLine]): the text lines to search in
            pattern (regex.Regex): the compiled pattern to search for
            preprocess: function applied to the text of each line before the texts are concatenated

        Returns:
            list[tuple[regex.Match, fitz.Rect]]: for every match, the match object and the union of the rects of
                the lines that the match overlaps with
        """
        pieces = []
        spans = []  # (line, start offset, end offset) of every line within the concatenated text
        offset = 0
        for line in lines:
            text = preprocess(line.text)
            spans.append((line, offset, offset + len(text)))
            pieces.append(text)
            offset += len(text) + 1  # +1 for the separating space
        full_text = "".join(piece + " " for piece in pieces)

        results = []
        for match in pattern.finditer(full_text):
            rect = fitz.Rect()
            for line, start, end in spans:
                # `end` is the offset of the separator that follows the line; the comparison is inclusive, so a
                # line is also selected when the match starts exactly at that separator.
                if end >= match.start() and start < match.end():
                    rect.include_rect(line.rect)
            results.append((match, rect))
        return results

    def extract_coordinates(self) -> Coordinate | None:
        """Extracts the coordinates from a borehole profile.

        Processes the borehole profile page by page and tries to find the coordinates in the respective text of the
        page.

        Algorithm description:
            1. search for coordinates with explicit 'X' and 'Y' labels
            2. if that gives no results, search for coordinates close to an explicit "coordinates" label
            3. if that gives no results either, try to detect coordinates in the full text

        Returns:
            Coordinate | None: the first coordinates found on the first page that yields any result, or None if
                no page yields coordinates
        """
        for page in self.doc:
            lines = extract_text_lines(page)
            page_number = page.number + 1  # page.number is 0-based

            # Each strategy returns a list; `or` falls through to the next strategy when the list is empty.
            found_coordinates = (
                self.get_coordinates_with_x_y_labels(lines, page_number)
                or self.get_coordinates_near_key(lines, page_number, page.rect.width)
                or self.get_coordinates_from_lines(lines, page_number)
            )
            if found_coordinates:
                return found_coordinates[0]

        logger.info("No coordinates found in this borehole profile.")
        return None