-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_extractor.py
159 lines (121 loc) · 6.3 KB
/
data_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""Module for data extraction from stratigraphy data files.
This module defines the DataExtractor class for extracting data from stratigraphy data files.
"""
import logging
from abc import ABC, ABCMeta, abstractmethod
from dataclasses import dataclass
import fitz
import regex
from stratigraphy.data_extractor.utility import get_lines_near_rect
from stratigraphy.lines.line import TextLine
from stratigraphy.util.util import read_params
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass(kw_only=True)
class ExtractedFeature(metaclass=ABCMeta):
    """Abstract base record for a piece of information extracted from a document.

    Fields are keyword-only. Subclasses must implement ``is_valid``.
    """

    # The rectangle that contains the extracted information.
    rect: fitz.Rect
    # The page number of the PDF document.
    page: int

    @abstractmethod
    def is_valid(self) -> bool:
        """Tell whether the extracted information is valid.

        Returns:
            bool: True when the information is valid, False otherwise.
        """
class DataExtractor(ABC):
    """Abstract class for data extraction from stratigraphy data files.

    Subclasses set ``feature_name`` and, where needed, the ``search_*_factor`` attributes and
    ``preprocess_replacements``. The constructor loads the keys for that feature from
    ``matching_params.yml``.
    """

    doc: fitz.Document = None
    feature_keys: list[str] = None
    feature_fp_keys: list[str] = None
    feature_name: str = None

    # How much to the left of a key do we look for the feature information, as a multiple of the key line width
    search_left_factor: float = 0
    # How much to the right of a key do we look for the feature information, as a multiple of the key line width
    search_right_factor: float = 0
    # How much below a key do we look for the feature information, as a multiple of the key line height
    search_below_factor: float = 0
    # How much above a key do we look for the feature information, as a multiple of the key line height
    search_above_factor: float = 0

    # Substring replacements applied by preprocess(). This dict lives on the class, so it is shared by
    # every subclass that does not override it: override it in subclasses instead of mutating it in place.
    preprocess_replacements: dict[str, str] = {}

    def __init__(self, document: fitz.Document):
        """Initializes the DataExtractor object.

        The feature keys and false-positive keys are read from ``matching_params.yml`` under
        ``<feature_name>_keys`` and ``<feature_name>_fp_keys``.

        Args:
            document (fitz.Document): A PDF document.

        Raises:
            ValueError: If the (sub)class does not define a non-empty ``feature_name``.
        """
        if not self.feature_name:
            raise ValueError("Feature name must be specified.")

        self.doc = document

        # Read the parameter file once and take both key lists from the same snapshot.
        matching_params = read_params("matching_params.yml")
        self.feature_keys = matching_params[f"{self.feature_name}_keys"]
        # A missing/empty false-positive list is normalised to [] so it can always be iterated.
        self.feature_fp_keys = matching_params[f"{self.feature_name}_fp_keys"] or []

    def preprocess(self, value: str) -> str:
        """Applies the configured substring replacements to a text value.

        Every ``old -> new`` pair of ``preprocess_replacements`` is applied with ``str.replace``,
        in the iteration order of the dictionary.

        Args:
            value (str): The text to preprocess.

        Returns:
            str: The text after all replacements have been applied.
        """
        for old, new in self.preprocess_replacements.items():
            value = value.replace(old, new)
        return value

    def find_feature_key(self, lines: list[TextLine], allowed_error_rate: float = 0.2) -> list[TextLine]:  # noqa: E501
        """Finds the lines that contain one of the feature keys.

        This is useful to reduce the text within which the feature is searched. If the text is too large
        false positives (found feature that is actually not the feature) are more likely.

        Keys shorter than 5 characters must match exactly. Longer keys may contain a certain number of
        errors, defined as insertions, deletions or substitutions of characters (i.e. Levenshtein distance).
        Matching is case-insensitive. For more information on how errors are defined see
        https://github.com/mrabarnett/mrab-regex?tab=readme-ov-file#approximate-fuzzy-matching-hg-issue-12-hg-issue-41-hg-issue-109.

        Lines whose text contains any of the false-positive keys (plain, case-sensitive substring test)
        are never returned.

        Args:
            lines (list[TextLine]): Arbitrary text lines to search in.
            allowed_error_rate (float, optional): The maximum number of errors (Levenshtein distance) to consider
                                    a key contained in text, as a fraction of the key length. Defaults to 0.2
                                    (guesstimate; no optimisation done yet).

        Returns:
            list[TextLine]: The distinct lines in which a feature key was found, in order of first match
                            (keys in their configured order, then lines in input order).
        """
        # The false-positive check looks at the whole line and does not depend on the key, so do it once.
        candidate_lines = [
            line for line in lines if not any(fp_key in line.text for fp_key in self.feature_fp_keys)
        ]

        # A dict serves as an insertion-ordered set, so the result order is reproducible between runs
        # (a plain set would yield the lines in arbitrary order).
        matches: dict[TextLine, None] = {}

        for key in self.feature_keys:
            if len(key) < 5:
                # If the key is very short, do an exact match
                pattern = regex.compile(r"(\b" + regex.escape(key) + r"\b)", flags=regex.IGNORECASE)
            else:
                # Allow for a certain number of errors in longer keys
                allowed_errors = int(len(key) * allowed_error_rate)
                pattern = regex.compile(
                    r"(\b" + regex.escape(key) + r"\b){e<=" + str(allowed_errors) + r"}", flags=regex.IGNORECASE
                )

            for line in candidate_lines:
                if pattern.search(line.text):
                    matches[line] = None

        return list(matches)

    def get_lines_near_key(self, lines: list[TextLine], key_line: TextLine) -> list[TextLine]:
        """Find the lines of the text that are close to an identified key.

        The line of the identified key is always returned as the first item in the list.

        Args:
            lines (list[TextLine]): Arbitrary text lines to search in.
            key_line (TextLine): The line of the identified key.

        Returns:
            list[TextLine]: The lines close to the key, sorted by the vertical distance between their top
                            edge and the top edge of the key line.
        """
        nearby_lines = self.get_lines_near_rect(lines, key_line.rect)

        # Put key_line first and drop duplicates while keeping order. A new list is built so that the
        # list returned by get_lines_near_rect is not modified.
        feature_lines = list(dict.fromkeys([key_line, *nearby_lines]))

        # Sort by vertical distance between the top of the feature line and the top of key_line.
        # sorted() is stable, so key_line (distance 0 and first in the list) stays in front.
        return sorted(feature_lines, key=lambda line: abs(line.rect.y0 - key_line.rect.y0))

    def get_lines_near_rect(self, lines: list[TextLine], rect: fitz.Rect) -> list[TextLine]:
        """Find the lines of the text that are close to a given rectangle.

        Delegates to the module-level ``get_lines_near_rect`` utility, passing this extractor's
        left/right/above/below search factors.

        Args:
            lines (list[TextLine]): Arbitrary text lines to search in.
            rect (fitz.Rect): The rectangle to search around.

        Returns:
            list[TextLine]: The lines close to the rectangle.
        """
        return get_lines_near_rect(
            self.search_left_factor,
            self.search_right_factor,
            self.search_above_factor,
            self.search_below_factor,
            lines,
            rect,
        )