-
Notifications
You must be signed in to change notification settings - Fork 2
/
textblock.py
185 lines (148 loc) · 6.69 KB
/
textblock.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
"""This module contains the TextBlock class, which represents a block of text in a PDF document."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
import fitz
import numpy as np
from stratigraphy.lines.line import TextLine
@dataclass
class MaterialDescription:
    """Class to represent a material description in a PDF document.

    Note: This class is similar to the TextBlock class. As such it has the attributes text and rect.
    But it does not have the attribute lines and is missing class methods. TextBlock is used during the extraction
    process where more fine-grained information is required. We lose this "fine-grainedness" when we annotate
    the boreholes in label-studio.
    """

    text: str  # the text of the material description
    rect: fitz.Rect  # the rectangle of the description; serialized as [x0, y0, x1, y1]

    def to_json(self) -> dict[str, Any]:
        """Convert the MaterialDescription object to a JSON serializable dictionary.

        Returns:
            dict[str, Any]: A dictionary with the key "text" (the description text) and the key "rect"
                (the rectangle coordinates as the list [x0, y0, x1, y1]).
        """
        rect = self.rect
        return {
            "text": self.text,
            "rect": [rect.x0, rect.y0, rect.x1, rect.y1],
        }
@dataclass
class TextBlock:
    """Class to represent a block of text in a PDF document.

    A TextBlock is a collection of Lines surrounded by Lines.
    It is used to represent a block of text in a PDF document.

    Attributes:
        lines (list[TextLine]): The text lines that make up the block.
        is_terminated_by_line (bool): Whether the block is terminated by a line. Defaults to False.

    The following attributes are derived from ``lines`` when the object is created:
        line_count: The number of text lines in the block.
        text: The texts of all lines, joined by single spaces.
        rect: The smallest rectangle containing the rectangles of all lines (an empty ``fitz.Rect()`` if there
            are no lines).
        page_number: The page number shared by all lines, or None if there are no lines.
    """

    lines: list[TextLine]
    is_terminated_by_line: bool = False

    def __post_init__(self):
        """Derive the page number, text and bounding rectangle of the block from its lines.

        Raises:
            AssertionError: If the lines of the block do not all have the same page number.
        """
        # Validate before doing any other work. An explicit raise is used instead of an assert statement, so
        # that the check is still performed when Python runs with optimizations enabled (-O).
        page_numbers = {line.page_number for line in self.lines}
        if len(page_numbers) > 1:
            raise AssertionError("TextBlock spans multiple pages")
        self.page_number = page_numbers.pop() if page_numbers else None

        self.line_count = len(self.lines)
        self.text = " ".join(line.text for line in self.lines)
        if self.line_count:
            self.rect = fitz.Rect(
                min(line.rect.x0 for line in self.lines),
                min(line.rect.y0 for line in self.lines),
                max(line.rect.x1 for line in self.lines),
                max(line.rect.y1 for line in self.lines),
            )
        else:
            self.rect = fitz.Rect()

    def concatenate(self, other: TextBlock) -> TextBlock:
        """Concatenate two text blocks.

        The lines of the other block are appended after the lines of this block. As the concatenated block ends
        where the other block ends, it is terminated by a line if and only if the other block is.

        Args:
            other (TextBlock): The other text block.

        Returns:
            TextBlock: The concatenated text block.
        """
        return TextBlock([*self.lines, *other.lines], is_terminated_by_line=other.is_terminated_by_line)

    # LGD-288: sometimes indentation is the only significant signal for deciding where we need to split the material
    # descriptions of adjacent layers.
    def split_based_on_indentation(self) -> list[TextBlock]:
        """Split the text block based on indentation.

        A line counts as not indented if it starts less than 2% of the maximal line width to the right of the
        leftmost line start. Every such line starts a new block; all other lines continue the current block.

        Returns:
            list[TextBlock]: The split text blocks. This is an empty list if the block has no lines, and a list
                containing only this block if no split is performed. If this block is terminated by a line,
                then the last of the newly created blocks is as well.
        """
        if not self.lines:
            return []

        min_line_start = min(line.rect.x0 for line in self.lines)
        max_line_width = max(line.rect.width for line in self.lines)
        first_line_start = self.lines[0].rect.x0

        indentation_low = min_line_start + 0.02 * max_line_width
        indentation_high = min_line_start + 0.2 * max_line_width

        # don't do anything if the first line already indented (e.g. centered text)
        if first_line_start > indentation_low:
            return [self]

        # don't do anything if we don't have any lines at a reasonable indentation
        # (2%-20% of max width from leftmost edge)
        if all(line.rect.x0 < indentation_low or line.rect.x0 > indentation_high for line in self.lines):
            return [self]

        # split based on indentation
        blocks = []
        current_block_lines = []
        for line in self.lines:
            if line.rect.x0 < indentation_low:
                # a line that is not indented starts a new block
                if current_block_lines:
                    blocks.append(TextBlock(current_block_lines))
                current_block_lines = [line]
            else:
                # an indented line continues the current block
                current_block_lines.append(line)
        if current_block_lines:
            blocks.append(TextBlock(current_block_lines))

        if self.is_terminated_by_line:  # if the block was terminated by a line, then the last block should be as well
            blocks[-1].is_terminated_by_line = True

        return blocks

    def _is_legend(self) -> bool:
        """Check if the current block contains / is a legend.

        Legends are characterized by having multiple lines of a single word (e.g. "sand", "kies", etc.). Furthermore
        these words are usually aligned in either the x or y direction.

        Returns:
            bool: Whether the block is or contains a legend.
        """
        punctuation = (".", ",", ";", ":", "!", "?")
        y0_coordinates = []
        x0_coordinates = []
        number_horizontally_close = 0
        number_vertically_close = 0
        for line in self.lines:
            # only lines consisting of a single word are candidates for legend entries
            if " " in line.text:
                continue
            # sometimes single words in text are delimited by a punctuation.
            if any(char in line.text for char in punctuation):
                continue
            # compare against the single-word lines seen so far, with a tolerance of 1
            if _is_close(line.rect.y0, y0_coordinates, 1):
                number_horizontally_close += 1
            if _is_close(line.rect.x0, x0_coordinates, 1):
                number_vertically_close += 1
            x0_coordinates.append(line.rect.x0)
            y0_coordinates.append(line.rect.y0)
        return number_horizontally_close > 1 or number_vertically_close > 2

    def to_json(self) -> dict[str, Any]:
        """Convert the TextBlock object to a JSON serializable dictionary.

        Returns:
            dict[str, Any]: A dictionary with the keys "text", "rect" (the list [x0, y0, x1, y1]), "lines"
                (the result of to_json() for every line) and "page" (the page number).
        """
        return {
            "text": self.text,
            "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
            "lines": [line.to_json() for line in self.lines],
            "page": self.page_number,
        }
def _is_close(a: float, b: list, tolerance: float) -> bool:
    """Check whether a value is close to at least one of the given reference values.

    Args:
        a (float): The value to check.
        b (list): The reference values to compare against.
        tolerance (float): The absolute difference below which (exclusive) two values count as close.

    Returns:
        bool: True if the absolute difference between a and some element of b is strictly smaller than the
            tolerance. False otherwise, in particular if b is empty.
    """
    for reference in b:
        if abs(a - reference) < tolerance:
            return True
    return False
def block_distance(block1: TextBlock, block2: TextBlock) -> float:
    """Calculate the distance between two text blocks.

    The distance is the y-coordinate of the top of the second block minus the y-coordinate of the bottom of the
    first block.

    A first block that is terminated by a line is treated as infinitely far away from the next block, so that
    the two blocks are never merged.

    Args:
        block1 (TextBlock): The first text block.
        block2 (TextBlock): The second text block.

    Returns:
        float: The distance between the two text blocks.
    """
    if not block1.is_terminated_by_line:
        bottom_of_first = block1.rect.y1
        top_of_second = block2.rect.y0
        return top_of_second - bottom_of_first

    # a terminating line acts as a hard separator between the blocks
    return np.inf