Skip to content

Commit

Permalink
Use generators instead of lists and remove extra calls (#34)
Browse files (browse the repository at this point in the history)
* enhancement: Use generators instead of lists and remove extra calls

* remove gc.collect call and clear xml tree element
  • Loading branch information
krishnasism authored Jan 16, 2024
1 parent 6979914 commit e720997
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 24 deletions.
2 changes: 1 addition & 1 deletion hotpdf/helpers/nanoid.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def method(algorithm: Callable[[int], bytearray], alphabet: str, size: int) -> s
mask = 1
if alphabet_len > 1:
mask = (2 << int(log(alphabet_len - 1) / log(2))) - 1
step = int(ceil(1.6 * mask * size / alphabet_len))
step = ceil(1.6 * mask * size / alphabet_len)

id = ""
while True:
Expand Down
3 changes: 1 addition & 2 deletions hotpdf/hotpdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import gc
import logging
import math
import os
Expand Down Expand Up @@ -97,8 +96,8 @@ def load(
parsed_page.build_memory_map()
parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans)
self.pages.append(parsed_page)
element.clear()
root.clear()
gc.collect()

def __extract_full_text_span(
self,
Expand Down
36 changes: 15 additions & 21 deletions hotpdf/memory_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,25 +63,22 @@ def display_memory_map(self, save: bool = False, filename: str = "memory_map.txt
def __get_page_spans(self, page: ET.Element) -> Generator[ET.Element, None, None]:
    """
    Iterate over every ``span`` element found anywhere beneath ``page``.

    Args:
        page (ET.Element): Element whose descendants are searched
            (presumably one parsed page of the XML document; confirm against caller).

    Returns:
        Generator[ET.Element, None, None]: Matching ``span`` elements, produced
            on demand rather than collected into a list.
    """
    # NOTE(review): the object returned here is an iterator, so it is truthy even
    # when no span matches; callers cannot use a plain truth test to detect "no spans".
    return page.iterfind(".//span")

def __get_page_chars(self, page: ET.Element) -> list[ET.Element]:
return page.findall(".//char")
def __get_page_chars(self, page: ET.Element) -> Generator[ET.Element, None, None]:
return page.iterfind(".//char")

def __get_span_chars(self, spans: Generator[ET.Element, None, None], drop_duplicate_spans: bool) -> list[ET.Element]:
chars: list[ET.Element] = []
def __get_span_chars(self, spans: Generator[ET.Element, None, None], drop_duplicate_spans: bool) -> Generator[ET.Element, None, None]:
seen_span_hashes: set[str] = set()
for span in spans:
span_id: str = generate_nano_id(size=10)
span_chars: list[ET.Element] = span.findall(".//")
span_hash: str = md5(f"{str(span.attrib)}|{str([_char.attrib for _char in span_chars])}".encode()).hexdigest()
span_hash: str = md5(f"{str(span.attrib)}|{str(_char.attrib for _char in span.iterfind('.//'))}".encode()).hexdigest()
if drop_duplicate_spans:
if span_hash in seen_span_hashes:
continue
seen_span_hashes.add(span_hash)
for char in span_chars:
for char in span.iterfind(".//"):
char.set("span_id", span_id)
chars.append(char)
yield char
del seen_span_hashes
return chars

def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -> None:
"""
Expand All @@ -95,7 +92,7 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -
"""
char_hot_characters: list[tuple[str, HotCharacter]] = []
spans: Generator[ET.Element, None, None] = self.__get_page_spans(page)
chars: list[ET.Element] = []
chars: Generator[ET.Element, None, None]
if spans:
chars = self.__get_span_chars(
spans=spans,
Expand All @@ -105,12 +102,12 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -
chars = self.__get_page_chars(page)
for char in chars:
char_bbox = char.attrib["bbox"]
char_x0, char_y0, char_x1, _ = [float(char_coord) for char_coord in char_bbox.split()]
char_x0, char_y0, char_x1, _ = map(float, char_bbox.split())
char_c = char.attrib["c"]
char_span_id = char.attrib.get("span_id")
cell_x = int(math.floor(char_x0))
cell_y = int(math.floor(char_y0))
cell_x_end = int(math.ceil(char_x1))
cell_x = math.floor(char_x0)
cell_y = math.floor(char_y0)
cell_x_end = math.ceil(char_x1)
hot_character = HotCharacter(
value=char_c,
x=cell_x,
Expand All @@ -121,9 +118,6 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -
if not 0 < cell_x or not 0 < cell_y:
continue

if self.memory_map.get(row_idx=cell_y, column_idx=cell_x) != "":
cell_x += 1
char_x1 += 1
self.memory_map.insert(value=char_c, row_idx=cell_y, column_idx=cell_x)
char_hot_characters.append((
char_c,
Expand Down Expand Up @@ -152,10 +146,10 @@ def extract_text_from_bbox(self, x0: float, x1: float, y0: float, y1: float) ->
Returns:
str: Extracted text within the bounding box.
"""
cell_x0 = int(math.floor(x0))
cell_x1 = int(math.ceil(x1))
cell_y0 = int(math.floor(y0))
cell_y1 = int(math.ceil(y1))
cell_x0 = math.floor(x0)
cell_x1 = math.ceil(x1)
cell_y0 = math.floor(y0)
cell_y1 = math.ceil(y1)

extracted_text = ""
for row in range(cell_y0, cell_y1 + 1):
Expand Down

0 comments on commit e720997

Please sign in to comment.