-
Notifications
You must be signed in to change notification settings - Fork 2
/
description_block_splitter.py
171 lines (133 loc) · 6.86 KB
/
description_block_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Classes for partitioning material descriptions text into blocks."""
import abc
import fitz
import numpy as np
from stratigraphy.util.dataclasses import Line
from stratigraphy.util.line import TextLine
from stratigraphy.util.textblock import TextBlock
class DescriptionBlockSplitter(metaclass=abc.ABCMeta):
    """Abstract base class for splitting material descriptions into blocks based on a certain condition.

    Subclasses implement `separator_condition`, which decides whether a block boundary lies between two
    consecutive text lines; `create_blocks` applies that decision to a whole list of lines.
    """

    # Value passed as ``is_terminated_by_line`` to every block that is closed because the separator
    # condition fired. Only annotated here; the concrete splitter has to assign it.
    set_terminated_by_line_flag: bool

    @abc.abstractmethod
    def __init__(self):
        """Initialise the splitter; every concrete subclass has to provide its own constructor."""

    @abc.abstractmethod
    def separator_condition(self, last_line: TextLine, current_line: TextLine) -> bool:
        """Decide whether a block boundary lies between two consecutive lines.

        Args:
            last_line (TextLine): The last line of the block that is currently being built.
            current_line (TextLine): The line that is about to be added.

        Returns:
            bool: True if `current_line` has to start a new block, False otherwise.
        """

    def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]:
        """Partition the description lines into blocks.

        The lines are processed in the given order. A new block is started whenever `separator_condition`
        returns True for the last line of the current block and the next line.

        Args:
            description_lines (list[TextLine]): all the text lines from the material descriptions.

        Returns:
            list[TextBlock]: the list of textblocks; empty if `description_lines` is empty.
        """
        blocks = []
        current_block_lines = []
        for line in description_lines:
            # The very first line has no predecessor, so the condition is only evaluated for later lines.
            if current_block_lines and self.separator_condition(current_block_lines[-1], line):
                blocks.append(
                    TextBlock(current_block_lines, is_terminated_by_line=self.set_terminated_by_line_flag)
                )
                current_block_lines = []
            current_block_lines.append(line)

        # The trailing block was not closed by a separator, so the flag is deliberately not passed here
        # and the default of TextBlock applies.
        if current_block_lines:
            blocks.append(TextBlock(current_block_lines))
        return blocks
class SplitDescriptionBlockByLine(DescriptionBlockSplitter):
    """Creates blocks based on longer lines between the material description text."""

    def __init__(self, threshold: float, material_description_rect: fitz.Rect, geometric_lines: list[Line]):
        """Create a new SplitDescriptionBlockByLine instance.

        Args:
            threshold (float): Fraction of the width of `material_description_rect` that the horizontal
                overlap between a geometric line and the rect has to exceed for the line to split blocks.
            material_description_rect (fitz.Rect): The bounding box for all material descriptions.
            geometric_lines (list[Line]): The geometric lines detected in the pdf page.
        """
        super().__init__()
        self.threshold = threshold
        self.material_description_rect = material_description_rect
        self.geometric_lines = geometric_lines
        self.set_terminated_by_line_flag = True

    def separator_condition(self, last_line: TextLine, current_line: TextLine) -> bool:
        """Check if a block is separated by a line.

        A geometric line separates the two text lines when its vertical midpoint lies strictly between
        the vertical midpoints of both text lines and its horizontal overlap with the material
        description rect is larger than `threshold` times the width of that rect.

        Args:
            last_line (TextLine): The previous line.
            current_line (TextLine): The current line.

        Returns:
            bool: True if the block is separated by a line, False otherwise.
        """
        last_line_y_coordinate = (last_line.rect.y0 + last_line.rect.y1) / 2
        current_line_y_coordinate = (current_line.rect.y0 + current_line.rect.y1) / 2
        description_rect = self.material_description_rect

        for line in self.geometric_lines:
            # Cheap vertical test first: lines that are not between the two text lines can never split
            # them, so the overlap computation below is skipped for those.
            line_y_coordinate = (line.start.y + line.end.y) / 2
            if not last_line_y_coordinate < line_y_coordinate < current_line_y_coordinate:
                continue

            # Normalise the endpoints so that the result does not depend on the direction of the line.
            line_left_x = np.min([line.start.x, line.end.x])
            line_right_x = np.max([line.start.x, line.end.x])

            # Length of the part of the line that lies within the horizontal extent of the rect.
            horizontal_overlap = np.min([description_rect.x1, line_right_x]) - np.max(
                [description_rect.x0, line_left_x]
            )
            if horizontal_overlap > self.threshold * description_rect.width:
                return True
        return False
class SplitDescriptionBlockByLeftHandSideSeparator(DescriptionBlockSplitter):
    """Creates blocks based on shorter lines at the left-hand side of the material description text."""

    def __init__(self, length_threshold: float, geometric_lines: list[Line]):
        """Create a new SplitDescriptionBlockByLeftHandSideSeparator instance.

        Args:
            length_threshold (float): The horizontal extent that a line segment on the left side of a block
                has to exceed in order to split the block.
            geometric_lines (list[Line]): The geometric lines detected in the pdf page.
        """
        super().__init__()
        self.length_threshold = length_threshold
        self.set_terminated_by_line_flag = False
        self.geometric_lines = geometric_lines

    def separator_condition(self, last_line: TextLine, current_line: TextLine) -> bool:
        """Check if a block is separated by a line segment on the left side of the block.

        A geometric line separates the two text lines when all of the following hold: its vertical
        midpoint lies strictly between the vertical midpoints of both text lines, its horizontal extent
        is larger than `length_threshold`, and the left edge (x0) of both text lines lies strictly
        between the two x-coordinates of the line.

        Args:
            last_line (TextLine): The previous line.
            current_line (TextLine): The current line.

        Returns:
            bool: True if the block is separated by a line segment, False otherwise.
        """
        last_line_y_coordinate = (last_line.rect.y0 + last_line.rect.y1) / 2
        current_line_y_coordinate = (current_line.rect.y0 + current_line.rect.y1) / 2

        for line in self.geometric_lines:
            line_y_coordinate = (line.start.y + line.end.y) / 2
            line_ends_block = last_line_y_coordinate < line_y_coordinate < current_line_y_coordinate
            if not line_ends_block:
                continue

            # Normalise the endpoints: a segment that is stored right-to-left (start.x > end.x) covers the
            # same x-range and must be treated exactly like its left-to-right counterpart.
            line_left_x = np.min([line.start.x, line.end.x])
            line_right_x = np.max([line.start.x, line.end.x])

            # for the block splitting, we only care about x-extension
            is_line_long_enough = line_right_x - line_left_x > self.length_threshold
            line_cuts_lefthandside_of_text = (line_left_x < last_line.rect.x0 < line_right_x) and (
                line_left_x < current_line.rect.x0 < line_right_x
            )
            if is_line_long_enough and line_cuts_lefthandside_of_text:
                return True
        return False
class SplitDescriptionBlockByVerticalSpace(DescriptionBlockSplitter):
    """Splits the material description into blocks wherever consecutive text lines are far enough apart."""

    def __init__(self, threshold: float):
        """Set up a splitter that only looks at the vertical positions of the text lines.

        Args:
            threshold (float): Offset between the top edges (y0) of two consecutive lines that has to be
                exceeded for non-overlapping lines to be put into different blocks.
        """
        super().__init__()
        self.set_terminated_by_line_flag = False
        self.threshold = threshold

    def separator_condition(self, last_line: TextLine, current_line: TextLine) -> bool:
        """Tell whether the vertical spacing between two lines marks a block boundary.

        Args:
            last_line (TextLine): The previous line.
            current_line (TextLine): The current line.

        Returns:
            bool: True if the two lines belong to different blocks, False otherwise.
        """
        current_top = current_line.rect.y0
        previous_rect = last_line.rect
        return (
            # Either the top of the current line lies more than 5 units beyond the bottom of the previous
            # line (presumably PDF points; the rect coordinates are used as they are) ...
            current_top > previous_rect.y1 + 5
            # ... or the two lines do not overlap vertically and their top edges are more than
            # `threshold` apart.
            or (current_top > previous_rect.y1 and current_top > previous_rect.y0 + self.threshold)
        )