From 1ef9d4ba163a3de683e21c85d86fc1d0bc8926c0 Mon Sep 17 00:00:00 2001 From: Mattia Callegari Date: Wed, 24 Jan 2024 15:26:47 +0100 Subject: [PATCH 1/5] feat: :fire: use piping instead of temp_files --- hotpdf/hotpdf.py | 63 ++++++++++++++--------------------- hotpdf/processor.py | 80 ++++++++++++++++++++++++++++++++------------- 2 files changed, 82 insertions(+), 61 deletions(-) diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py index 1961451..56af770 100644 --- a/hotpdf/hotpdf.py +++ b/hotpdf/hotpdf.py @@ -1,7 +1,5 @@ import math import os -import tempfile -import xml.etree.ElementTree as ET from collections import defaultdict from typing import Optional, Union @@ -16,20 +14,33 @@ class HotPdf: def __init__( self, extraction_tolerance: int = 4, + source: Union[str, bytes, None] = None, + password: str = "", + drop_duplicate_spans: bool = True, + first_page: int = 0, + last_page: int = 0, ) -> None: """Initialize the HotPdf class. Args: extraction_tolerance (int, optional): Tolerance value used during text extraction to adjust the bounding box for capturing text. Defaults to 4. + pdf_file (str | bytes, optional): The path to the PDF file to be loaded, or a bytes object. + password (str, optional): Password to use to unlock the pdf + drop_duplicate_spans (bool, optional): Drop duplicate spans when loading. Defaults to True. + first_page (int, optional): The first page to load. Defaults to 0. + last_page (int, optional): The last page to load. Defaults to 0. + + Raises: + ValueError: If the page range is invalid. + FileNotFoundError: If the file is not found. + PermissionError: If the file is encrypted or the password is wrong. + RuntimeError: If an unkown error is generated by Ghostscript. """ self.pages: list[MemoryMap] = [] self.extraction_tolerance: int = extraction_tolerance - self.xml_file_path: Optional[str] = None - - def __delete_file(self, path: Union[str, None]) -> None: - if path and os.path.exists(path): - os.remove(path) + if source: + self.load(source, password, drop_duplicate_spans, first_page, last_page) def __check_file_exists(self, pdf_file: str) -> None: if not os.path.exists(pdf_file): @@ -47,14 +58,14 @@ def __check_page_range(self, first_page: int, last_page: int) -> None: if first_page > last_page or first_page < 0 or last_page < 0: raise ValueError("Invalid page range") - def __prechecks(self, pdf_file: Union[str, bytes], first_page: int, last_page: int) -> None: - if isinstance(pdf_file, str): - self.__check_file_exists(pdf_file) + def __prechecks(self, source: Union[str, bytes], first_page: int, last_page: int) -> None: + if type(source) is str: + self.__check_file_exists(source) self.__check_page_range(first_page, last_page) def load( self, - pdf_file: Union[str, bytes], + source: Union[str, bytes], password: str = "", drop_duplicate_spans: bool = True, first_page: int = 0, @@ -75,34 +86,8 @@ def load( PermissionError: If the file is encrypted or the password is wrong. RuntimeError: If an unkown error is generated by Ghostscript. """ - self.__prechecks(pdf_file, first_page, last_page) - _bytes_file_received: bool = False - if isinstance(pdf_file, bytes): - _temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) - _temp_pdf_file.write(pdf_file) - _temp_pdf_file_name = _temp_pdf_file.name - _bytes_file_received = True - else: - _temp_pdf_file_name = pdf_file - self.xml_file_path = processor.generate_xml_file(_temp_pdf_file_name, password, first_page, last_page) - tree_iterator = ET.iterparse(self.xml_file_path, events=("start", "end")) - event: str - root: ET.Element - event, root = next(tree_iterator) - - element: ET.Element - for event, element in tree_iterator: - if event == "end" and element.tag == "page": - parsed_page: MemoryMap = MemoryMap() - parsed_page.build_memory_map() - parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans) - self.pages.append(parsed_page) - element.clear() - root.clear() - - self.__delete_file(self.xml_file_path) - if _bytes_file_received: - self.__delete_file(_temp_pdf_file_name) + self.__prechecks(source, first_page, last_page) + self.pages = processor.process(source, password, drop_duplicate_spans, first_page, last_page) def __extract_full_text_span( self, diff --git a/hotpdf/processor.py b/hotpdf/processor.py index 182e4cd..963418d 100644 --- a/hotpdf/processor.py +++ b/hotpdf/processor.py @@ -4,7 +4,13 @@ import re import subprocess import tempfile +import xml.etree.ElementTree as ET from enum import Enum +from pathlib import Path +from typing import Union + +from hotpdf.helpers import nanoid +from hotpdf.memory_map import MemoryMap class Result(Enum): @@ -14,7 +20,15 @@ class Result(Enum): UNKNOWN_ERROR = 3 -def generate_xml_file(file_path: str, password: str, first_page: int, last_page: int) -> str: +def process( + source: Union[str, bytes], password: str, drop_duplicate_spans: bool, first_page: int, last_page: int +) -> list[MemoryMap]: + xml_file = __generate_xml_file(source, password, first_page, last_page) + pages = __parse_xml(xml_file, drop_duplicate_spans) + return pages + + +def __generate_xml_file(source: Union[str, bytes], password: str, first_page: int, last_page: int) -> Path: """Generate XML notation of PDF File. Args: @@ -28,19 +42,14 @@ def generate_xml_file(file_path: str, password: str, first_page: int, last_page: Returns: str: XML File Path. """ - temp_xml_file_name = tempfile.NamedTemporaryFile(delete=False).name - - result = __call_ghostscript(file_path, temp_xml_file_name, password, first_page, last_page) - + temp_xml_file_path = Path(tempfile.gettempdir(), nanoid.generate_nano_id() + ".xml") + result = __call_ghostscript(source, temp_xml_file_path, password, first_page, last_page) __handle_gs_result(result) - - __clean_xml(temp_xml_file_name) - - return temp_xml_file_name + return __clean_xml(temp_xml_file_path) def __call_ghostscript( - file_path: str, temp_xml_file_name: str, password: str, first_page: int, last_page: int + source: Union[str, bytes], temp_xml_file_path: Path, password: str, first_page: int, last_page: int ) -> Result: ghostscript = "gs" if os.name != "nt" else "gswin64c" command_line_args = [ghostscript, "-dNOPAUSE", "-dBATCH", "-dSAFER", "-dTextFormat=1", "-sDEVICE=txtwrite"] @@ -52,13 +61,19 @@ def __call_ghostscript( if last_page: command_line_args.append(f"-dLastPage={last_page}") - command_line_args.extend([f'-sOutputFile="{temp_xml_file_name}"', f'"{file_path}"']) + command_line_args.append(f'-sOutputFile="{temp_xml_file_path}"') + + # Uses gs in pipe mode + pipe = type(source) is bytes + if pipe: + command_line_args.append("-") + else: + command_line_args.append(str(source)) + gs_call = " ".join(command_line_args) try: - output = subprocess.check_output(gs_call, shell=ghostscript == "gs", stderr=subprocess.STDOUT).decode( - errors="ignore" - ) + output = subprocess.run(gs_call, shell=ghostscript == "gs", input=source if pipe else None, capture_output=True) status = __validate_gs_output(output) except subprocess.CalledProcessError as err: status = Result.UNKNOWN_ERROR @@ -67,7 +82,7 @@ def __call_ghostscript( return status -def __clean_xml(temporary_xml_file_name: str) -> None: +def __clean_xml(temporary_xml_path: Path) -> Path: """ Clean the raw xlm file generated by ghostscript. Apply changes directly to the temporaryfile. @@ -75,7 +90,7 @@ def __clean_xml(temporary_xml_file_name: str) -> None: Args: temporary_xml_file_name (str): the temporary file outputted by ghostscript """ - with open(temporary_xml_file_name, "r+", encoding="utf-8") as f: + with open(temporary_xml_path, "r+", encoding="utf-8") as f: raw_xml = f.read() raw_xml = re.sub(r"(&#x[0-9]+;)", "", raw_xml) raw_xml = re.sub(r"(")", "'", raw_xml) @@ -89,16 +104,18 @@ def __clean_xml(temporary_xml_file_name: str) -> None: f.seek(0) f.write(raw_xml) f.truncate() + return temporary_xml_path -def __validate_gs_output(output: str) -> Result: - if "This file requires a password for access" in output: +def __validate_gs_output(output: subprocess.CompletedProcess[bytes]) -> Result: + if output.returncode != 0: + return Result.UNKNOWN_ERROR + err = output.stderr.decode(errors="ignore") + if "This file requires a password for access" in err: return Result.LOCKED - if "Password did not work" in output: + if "Password did not work" in err: return Result.WRONG_PASSWORD - if "Page" in output: - return Result.LOADED - return Result.UNKNOWN_ERROR + return Result.UNKNOWN_ERROR if "Error" in err else Result.LOADED def __handle_gs_result(status: Result) -> None: @@ -117,3 +134,22 @@ def __handle_gs_result(status: Result) -> None: if status == Result.UNKNOWN_ERROR: logging.error("GS: UNKNOWN ERROR") raise RuntimeError("Unknown error in processing") + + +def __parse_xml(xml_file_path: Path, drop_duplicate_spans: bool) -> list[MemoryMap]: + pages: list[MemoryMap] = [] + tree_iterator = ET.iterparse(xml_file_path, events=("start", "end")) + event: str + root: ET.Element + event, root = next(tree_iterator) + element: ET.Element + for event, element in tree_iterator: + if event == "end" and element.tag == "page": + parsed_page: MemoryMap = MemoryMap() + parsed_page.build_memory_map() + parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans) + pages.append(parsed_page) + element.clear() + root.clear() + os.remove(xml_file_path) + return pages From 5083f9fa0b8e474f12c05ad15d6a82d2b8c544aa Mon Sep 17 00:00:00 2001 From: Mattia Callegari Date: Wed, 24 Jan 2024 15:54:51 +0100 Subject: [PATCH 2/5] fix: :test_tube: unit testing with mocks --- hotpdf/processor.py | 2 +- tests/test_functions.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hotpdf/processor.py b/hotpdf/processor.py index 963418d..99d9f9f 100644 --- a/hotpdf/processor.py +++ b/hotpdf/processor.py @@ -115,7 +115,7 @@ def __validate_gs_output(output: subprocess.CompletedProcess[bytes]) -> Result: return Result.LOCKED if "Password did not work" in err: return Result.WRONG_PASSWORD - return Result.UNKNOWN_ERROR if "Error" in err else Result.LOADED + return Result.LOADED def __handle_gs_result(status: Result) -> None: diff --git a/tests/test_functions.py b/tests/test_functions.py index 66f7111..18ab22f 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -98,12 +98,12 @@ def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name): hot_pdf_object = HotPdf() hot_pdf_object_with_dup_span = HotPdf() with patch( - "hotpdf.processor.generate_xml_file", + "hotpdf.processor.__generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml"), ): hot_pdf_object_with_dup_span.load(mock_hotpdf_bank_file_name, drop_duplicate_spans=False) with patch( - "hotpdf.processor.generate_xml_file", + "hotpdf.processor.__generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml"), ): hot_pdf_object.load(mock_hotpdf_bank_file_name) @@ -114,7 +114,7 @@ def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name): def test_load_negative_coordinates(mock_hotpdf_bank_file_name): QUERY = "HOTPDF BANK" with patch( - "hotpdf.processor.generate_xml_file", + "hotpdf.processor.__generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_negative_coords.xml"), ): hot_pdf_object = HotPdf() @@ -122,7 +122,7 @@ def test_load_negative_coordinates(mock_hotpdf_bank_file_name): assert not hot_pdf_object.find_text(QUERY)[0], "Expected string to be empty" # For sanity: The following file is same as above, except the coords are normal with patch( - "hotpdf.processor.generate_xml_file", + "hotpdf.processor.__generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_normal_coords.xml"), ): hot_pdf_object_normal = HotPdf() From 3a1ddd3697915c7734b6ec4e62811eca6c404848 Mon Sep 17 00:00:00 2001 From: Mattia Callegari Date: Wed, 24 Jan 2024 16:57:28 +0100 Subject: [PATCH 3/5] test: :test_tube: add constructor loading test --- .github/workflows/test.yml | 2 +- hotpdf/processor.py | 5 +---- pyproject.toml | 2 +- tests/test_benchmark.py | 5 +++-- tests/test_load.py | 5 +++++ 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9392a70..5de8006 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -64,7 +64,7 @@ jobs: run: pip install -e '.[dev]' - name: Run tests with coverage run: - python -m pytest --cov-fail-under=98 + python -m pytest --cov-fail-under=98 -n=auto - name: Upload coverage to coveralls if: github.event_name == 'push' uses: coverallsapp/github-action@v2.2.3 diff --git a/hotpdf/processor.py b/hotpdf/processor.py index 99d9f9f..b93303d 100644 --- a/hotpdf/processor.py +++ b/hotpdf/processor.py @@ -65,10 +65,7 @@ def __call_ghostscript( # Uses gs in pipe mode pipe = type(source) is bytes - if pipe: - command_line_args.append("-") - else: - command_line_args.append(str(source)) + command_line_args.append("-" if pipe else str(source)) gs_call = " ".join(command_line_args) diff --git a/pyproject.toml b/pyproject.toml index 317826c..0602255 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,4 +89,4 @@ strict = true [tool.pytest.ini_options] log_cli=true log_level="NOTSET" -addopts="-n=auto --cov --cov-report term-missing" +addopts="--cov --cov-report term-missing" diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index b42cc0b..308ca65 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -26,7 +26,8 @@ def perform_speed_test(file_name, expected_processing_seconds): hot_pdf_object = HotPdf() hot_pdf_object.load(file_name) end_time = time.time() - assert (end_time - start_time) < expected_processing_seconds, "Benchmark time exceeded!" + elapsed = end_time - start_time + assert (elapsed) < expected_processing_seconds, "Benchmark time exceeded!" def perform_memory_test(file_name, expected_peak_memory): @@ -39,7 +40,7 @@ def perform_memory_test(file_name, expected_peak_memory): def test_speed_benchmark_multiple_pages(multiple_pages_file_name): - perform_speed_test(multiple_pages_file_name, 4) + perform_speed_test(multiple_pages_file_name, 3) def test_memory_benchmark_multiple_pages(multiple_pages_file_name): diff --git a/tests/test_load.py b/tests/test_load.py index 04d1e1f..f3d4207 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -39,6 +39,11 @@ def test_load(valid_file_name): hot_pdf_object.load(valid_file_name) +def test_load_constructor(valid_file_name): + hotpdf_obj = HotPdf(source=valid_file_name) + assert len(hotpdf_obj.pages) > 0 + + def test_load_bytes(valid_file_name): with open(valid_file_name, "rb") as f: hot_pdf_object = HotPdf() From cc4bec44aaaa4988cb7926100d7573f5f6e0fcc5 Mon Sep 17 00:00:00 2001 From: Mattia Callegari Date: Wed, 24 Jan 2024 17:55:25 +0100 Subject: [PATCH 4/5] refactor: :construction: renaming variables --- hotpdf/hotpdf.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py index 56af770..7061e8e 100644 --- a/hotpdf/hotpdf.py +++ b/hotpdf/hotpdf.py @@ -14,7 +14,7 @@ class HotPdf: def __init__( self, extraction_tolerance: int = 4, - source: Union[str, bytes, None] = None, + pdf_file: Union[str, bytes, None] = None, password: str = "", drop_duplicate_spans: bool = True, first_page: int = 0, @@ -39,8 +39,8 @@ def __init__( """ self.pages: list[MemoryMap] = [] self.extraction_tolerance: int = extraction_tolerance - if source: - self.load(source, password, drop_duplicate_spans, first_page, last_page) + if pdf_file: + self.load(pdf_file, password, drop_duplicate_spans, first_page, last_page) def __check_file_exists(self, pdf_file: str) -> None: if not os.path.exists(pdf_file): @@ -58,14 +58,14 @@ def __check_page_range(self, first_page: int, last_page: int) -> None: if first_page > last_page or first_page < 0 or last_page < 0: raise ValueError("Invalid page range") - def __prechecks(self, source: Union[str, bytes], first_page: int, last_page: int) -> None: - if type(source) is str: - self.__check_file_exists(source) + def __prechecks(self, pdf_file: Union[str, bytes], first_page: int, last_page: int) -> None: + if type(pdf_file) is str: + self.__check_file_exists(pdf_file) self.__check_page_range(first_page, last_page) def load( self, - source: Union[str, bytes], + pdf_file: Union[str, bytes], password: str = "", drop_duplicate_spans: bool = True, first_page: int = 0, @@ -86,8 +86,8 @@ def load( PermissionError: If the file is encrypted or the password is wrong. RuntimeError: If an unkown error is generated by Ghostscript. """ - self.__prechecks(source, first_page, last_page) - self.pages = processor.process(source, password, drop_duplicate_spans, first_page, last_page) + self.__prechecks(pdf_file, first_page, last_page) + self.pages = processor.process(pdf_file, password, drop_duplicate_spans, first_page, last_page) def __extract_full_text_span( self, From 9af34cab29ef70dc9c5c2355c061fc0e7fda458b Mon Sep 17 00:00:00 2001 From: Mattia Callegari Date: Wed, 24 Jan 2024 18:03:57 +0100 Subject: [PATCH 5/5] fix: :boom: change args order for easier use, test --- hotpdf/hotpdf.py | 6 +++--- tests/test_load.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py index 7061e8e..90e1c17 100644 --- a/hotpdf/hotpdf.py +++ b/hotpdf/hotpdf.py @@ -13,23 +13,23 @@ class HotPdf: def __init__( self, - extraction_tolerance: int = 4, pdf_file: Union[str, bytes, None] = None, password: str = "", drop_duplicate_spans: bool = True, first_page: int = 0, last_page: int = 0, + extraction_tolerance: int = 4, ) -> None: """Initialize the HotPdf class. Args: - extraction_tolerance (int, optional): Tolerance value used during text extraction - to adjust the bounding box for capturing text. Defaults to 4. pdf_file (str | bytes, optional): The path to the PDF file to be loaded, or a bytes object. password (str, optional): Password to use to unlock the pdf drop_duplicate_spans (bool, optional): Drop duplicate spans when loading. Defaults to True. first_page (int, optional): The first page to load. Defaults to 0. last_page (int, optional): The last page to load. Defaults to 0. + extraction_tolerance (int, optional): Tolerance value used during text extraction + to adjust the bounding box for capturing text. Defaults to 4. Raises: ValueError: If the page range is invalid. diff --git a/tests/test_load.py b/tests/test_load.py index f3d4207..4a6f707 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -40,7 +40,7 @@ def test_load(valid_file_name): def test_load_constructor(valid_file_name): - hotpdf_obj = HotPdf(source=valid_file_name) + hotpdf_obj = HotPdf(valid_file_name) assert len(hotpdf_obj.pages) > 0