diff --git a/hotpdf/helpers/nanoid.py b/hotpdf/helpers/nanoid.py index c632a04..25ef87d 100644 --- a/hotpdf/helpers/nanoid.py +++ b/hotpdf/helpers/nanoid.py @@ -17,7 +17,7 @@ def method(algorithm: Callable[[int], bytearray], alphabet: str, size: int) -> s mask = 1 if alphabet_len > 1: mask = (2 << int(log(alphabet_len - 1) / log(2))) - 1 - step = int(ceil(1.6 * mask * size / alphabet_len)) + step = ceil(1.6 * mask * size / alphabet_len) id = "" while True: diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py index 0fd458a..b8de80d 100644 --- a/hotpdf/hotpdf.py +++ b/hotpdf/hotpdf.py @@ -1,4 +1,3 @@ -import gc import logging import math import os @@ -97,8 +96,8 @@ def load( parsed_page.build_memory_map() parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans) self.pages.append(parsed_page) + element.clear() root.clear() - gc.collect() def __extract_full_text_span( self, diff --git a/hotpdf/memory_map.py b/hotpdf/memory_map.py index e2bb4ce..2bdc11f 100644 --- a/hotpdf/memory_map.py +++ b/hotpdf/memory_map.py @@ -63,25 +63,22 @@ def display_memory_map(self, save: bool = False, filename: str = "memory_map.txt def __get_page_spans(self, page: ET.Element) -> Generator[ET.Element, None, None]: return page.iterfind(".//span") - def __get_page_chars(self, page: ET.Element) -> list[ET.Element]: - return page.findall(".//char") + def __get_page_chars(self, page: ET.Element) -> Generator[ET.Element, None, None]: + return page.iterfind(".//char") - def __get_span_chars(self, spans: Generator[ET.Element, None, None], drop_duplicate_spans: bool) -> list[ET.Element]: - chars: list[ET.Element] = [] + def __get_span_chars(self, spans: Generator[ET.Element, None, None], drop_duplicate_spans: bool) -> Generator[ET.Element, None, None]: seen_span_hashes: set[str] = set() for span in spans: span_id: str = generate_nano_id(size=10) - span_chars: list[ET.Element] = span.findall(".//") - span_hash: str = md5(f"{str(span.attrib)}|{str([_char.attrib for _char in 
span_chars])}".encode()).hexdigest() + span_hash: str = md5(f"{str(span.attrib)}|{str([_char.attrib for _char in span.iterfind('.//')])}".encode()).hexdigest() if drop_duplicate_spans: if span_hash in seen_span_hashes: continue seen_span_hashes.add(span_hash) - for char in span_chars: + for char in span.iterfind(".//"): char.set("span_id", span_id) - chars.append(char) + yield char del seen_span_hashes - return chars def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -> None: """ @@ -95,7 +92,7 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - """ char_hot_characters: list[tuple[str, HotCharacter]] = [] spans: Generator[ET.Element, None, None] = self.__get_page_spans(page) - chars: list[ET.Element] = [] + chars: Generator[ET.Element, None, None] if spans: chars = self.__get_span_chars( spans=spans, @@ -105,12 +102,12 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - chars = self.__get_page_chars(page) for char in chars: char_bbox = char.attrib["bbox"] - char_x0, char_y0, char_x1, _ = [float(char_coord) for char_coord in char_bbox.split()] + char_x0, char_y0, char_x1, _ = map(float, char_bbox.split()) char_c = char.attrib["c"] char_span_id = char.attrib.get("span_id") - cell_x = int(math.floor(char_x0)) - cell_y = int(math.floor(char_y0)) - cell_x_end = int(math.ceil(char_x1)) + cell_x = math.floor(char_x0) + cell_y = math.floor(char_y0) + cell_x_end = math.ceil(char_x1) hot_character = HotCharacter( value=char_c, x=cell_x, @@ -121,9 +118,6 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - if not 0 < cell_x or not 0 < cell_y: continue - if self.memory_map.get(row_idx=cell_y, column_idx=cell_x) != "": - cell_x += 1 - char_x1 += 1 self.memory_map.insert(value=char_c, row_idx=cell_y, column_idx=cell_x) char_hot_characters.append(( char_c, @@ -152,10 +146,10 @@ def extract_text_from_bbox(self, x0: float, x1: float, y0: float, y1: float) -> 
Returns: str: Extracted text within the bounding box. """ - cell_x0 = int(math.floor(x0)) - cell_x1 = int(math.ceil(x1)) - cell_y0 = int(math.floor(y0)) - cell_y1 = int(math.ceil(y1)) + cell_x0 = math.floor(x0) + cell_x1 = math.ceil(x1) + cell_y0 = math.floor(y0) + cell_y1 = math.ceil(y1) extracted_text = "" for row in range(cell_y0, cell_y1 + 1):