Merge pull request #72 from callegarimattia/feat/loading_with_constructor

feat: load with constructor, use piping instead of temp_files
weareprestatech · Jan 25, 2024 · a6df8b2 · a6df8b2
2 parents 7822f0c + 9af34ca
commit a6df8b2
Show file tree

Hide file tree

Showing 7 changed files with 89 additions and 65 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -64,7 +64,7 @@ jobs:
         run: pip install -e '.[dev]'
       - name: Run tests with coverage
         run:
-          python -m pytest --cov-fail-under=98
+          python -m pytest --cov-fail-under=98 -n=auto
       - name: Upload coverage to coveralls
         if: github.event_name == 'push'
         uses: coverallsapp/github-action@v2.2.3
diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py
@@ -1,7 +1,5 @@
 import math
 import os
-import tempfile
-import xml.etree.ElementTree as ET
 from collections import defaultdict
 from typing import Optional, Union
 
@@ -15,21 +13,34 @@
 class HotPdf:
     def __init__(
         self,
+        pdf_file: Union[str, bytes, None] = None,
+        password: str = "",
+        drop_duplicate_spans: bool = True,
+        first_page: int = 0,
+        last_page: int = 0,
         extraction_tolerance: int = 4,
     ) -> None:
         """Initialize the HotPdf class.
 
         Args:
+            pdf_file (str | bytes, optional): The path to the PDF file to be loaded, or a bytes object.
+            password (str, optional): Password to use to unlock the pdf
+            drop_duplicate_spans (bool, optional): Drop duplicate spans when loading. Defaults to True.
+            first_page (int, optional): The first page to load. Defaults to 0.
+            last_page (int, optional): The last page to load. Defaults to 0.
             extraction_tolerance (int, optional): Tolerance value used during text extraction
                 to adjust the bounding box for capturing text. Defaults to 4.
+
+        Raises:
+            ValueError: If the page range is invalid.
+            FileNotFoundError: If the file is not found.
+            PermissionError: If the file is encrypted or the password is wrong.
+            RuntimeError: If an unkown error is generated by Ghostscript.
         """
         self.pages: list[MemoryMap] = []
         self.extraction_tolerance: int = extraction_tolerance
-        self.xml_file_path: Optional[str] = None
-
-    def __delete_file(self, path: Union[str, None]) -> None:
-        if path and os.path.exists(path):
-            os.remove(path)
+        if pdf_file:
+            self.load(pdf_file, password, drop_duplicate_spans, first_page, last_page)
 
     def __check_file_exists(self, pdf_file: str) -> None:
         if not os.path.exists(pdf_file):
@@ -48,7 +59,7 @@ def __check_page_range(self, first_page: int, last_page: int) -> None:
             raise ValueError("Invalid page range")
 
     def __prechecks(self, pdf_file: Union[str, bytes], first_page: int, last_page: int) -> None:
-        if isinstance(pdf_file, str):
+        if type(pdf_file) is str:
             self.__check_file_exists(pdf_file)
         self.__check_page_range(first_page, last_page)
 
@@ -76,33 +87,7 @@ def load(
             RuntimeError: If an unkown error is generated by Ghostscript.
         """
         self.__prechecks(pdf_file, first_page, last_page)
-        _bytes_file_received: bool = False
-        if isinstance(pdf_file, bytes):
-            _temp_pdf_file = tempfile.NamedTemporaryFile(delete=False)
-            _temp_pdf_file.write(pdf_file)
-            _temp_pdf_file_name = _temp_pdf_file.name
-            _bytes_file_received = True
-        else:
-            _temp_pdf_file_name = pdf_file
-        self.xml_file_path = processor.generate_xml_file(_temp_pdf_file_name, password, first_page, last_page)
-        tree_iterator = ET.iterparse(self.xml_file_path, events=("start", "end"))
-        event: str
-        root: ET.Element
-        event, root = next(tree_iterator)
-
-        element: ET.Element
-        for event, element in tree_iterator:
-            if event == "end" and element.tag == "page":
-                parsed_page: MemoryMap = MemoryMap()
-                parsed_page.build_memory_map()
-                parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans)
-                self.pages.append(parsed_page)
-                element.clear()
-            root.clear()
-
-        self.__delete_file(self.xml_file_path)
-        if _bytes_file_received:
-            self.__delete_file(_temp_pdf_file_name)
+        self.pages = processor.process(pdf_file, password, drop_duplicate_spans, first_page, last_page)
 
     def __extract_full_text_span(
         self,

diff --git a/hotpdf/processor.py b/hotpdf/processor.py
@@ -4,7 +4,13 @@
 import re
 import subprocess
 import tempfile
+import xml.etree.ElementTree as ET
 from enum import Enum
+from pathlib import Path
+from typing import Union
+
+from hotpdf.helpers import nanoid
+from hotpdf.memory_map import MemoryMap
 
 
 class Result(Enum):
@@ -14,7 +20,15 @@ class Result(Enum):
     UNKNOWN_ERROR = 3
 
 
-def generate_xml_file(file_path: str, password: str, first_page: int, last_page: int) -> str:
+def process(
+    source: Union[str, bytes], password: str, drop_duplicate_spans: bool, first_page: int, last_page: int
+) -> list[MemoryMap]:
+    xml_file = __generate_xml_file(source, password, first_page, last_page)
+    pages = __parse_xml(xml_file, drop_duplicate_spans)
+    return pages
+
+
+def __generate_xml_file(source: Union[str, bytes], password: str, first_page: int, last_page: int) -> Path:
     """Generate XML notation of PDF File.
 
     Args:
@@ -28,19 +42,14 @@ def generate_xml_file(file_path: str, password: str, first_page: int, last_page:
     Returns:
         str: XML File Path.
     """
-    temp_xml_file_name = tempfile.NamedTemporaryFile(delete=False).name
-
-    result = __call_ghostscript(file_path, temp_xml_file_name, password, first_page, last_page)
-
+    temp_xml_file_path = Path(tempfile.gettempdir(), nanoid.generate_nano_id() + ".xml")
+    result = __call_ghostscript(source, temp_xml_file_path, password, first_page, last_page)
     __handle_gs_result(result)
-
-    __clean_xml(temp_xml_file_name)
-
-    return temp_xml_file_name
+    return __clean_xml(temp_xml_file_path)
 
 
 def __call_ghostscript(
-    file_path: str, temp_xml_file_name: str, password: str, first_page: int, last_page: int
+    source: Union[str, bytes], temp_xml_file_path: Path, password: str, first_page: int, last_page: int
 ) -> Result:
     ghostscript = "gs" if os.name != "nt" else "gswin64c"
     command_line_args = [ghostscript, "-dNOPAUSE", "-dBATCH", "-dSAFER", "-dTextFormat=1", "-sDEVICE=txtwrite"]
@@ -52,13 +61,16 @@ def __call_ghostscript(
     if last_page:
         command_line_args.append(f"-dLastPage={last_page}")
 
-    command_line_args.extend([f'-sOutputFile="{temp_xml_file_name}"', f'"{file_path}"'])
+    command_line_args.append(f'-sOutputFile="{temp_xml_file_path}"')
+
+    # Uses gs in pipe mode
+    pipe = type(source) is bytes
+    command_line_args.append("-" if pipe else str(source))
+
     gs_call = " ".join(command_line_args)
 
     try:
-        output = subprocess.check_output(gs_call, shell=ghostscript == "gs", stderr=subprocess.STDOUT).decode(
-            errors="ignore"
-        )
+        output = subprocess.run(gs_call, shell=ghostscript == "gs", input=source if pipe else None, capture_output=True)
         status = __validate_gs_output(output)
     except subprocess.CalledProcessError as err:
         status = Result.UNKNOWN_ERROR
@@ -67,15 +79,15 @@ def __call_ghostscript(
     return status
 
 
-def __clean_xml(temporary_xml_file_name: str) -> None:
+def __clean_xml(temporary_xml_path: Path) -> Path:
     """
     Clean the raw xlm file generated by ghostscript.
     Apply changes directly to the temporaryfile.
 
     Args:
         temporary_xml_file_name (str): the temporary file outputted by ghostscript
     """
-    with open(temporary_xml_file_name, "r+", encoding="utf-8") as f:
+    with open(temporary_xml_path, "r+", encoding="utf-8") as f:
         raw_xml = f.read()
         raw_xml = re.sub(r"(&#x[0-9]+;)", "", raw_xml)
         raw_xml = re.sub(r"(&quot;)", "'", raw_xml)
@@ -89,16 +101,18 @@ def __clean_xml(temporary_xml_file_name: str) -> None:
         f.seek(0)
         f.write(raw_xml)
         f.truncate()
+    return temporary_xml_path
 
 
-def __validate_gs_output(output: str) -> Result:
-    if "This file requires a password for access" in output:
+def __validate_gs_output(output: subprocess.CompletedProcess[bytes]) -> Result:
+    if output.returncode != 0:
+        return Result.UNKNOWN_ERROR
+    err = output.stderr.decode(errors="ignore")
+    if "This file requires a password for access" in err:
         return Result.LOCKED
-    if "Password did not work" in output:
+    if "Password did not work" in err:
         return Result.WRONG_PASSWORD
-    if "Page" in output:
-        return Result.LOADED
-    return Result.UNKNOWN_ERROR
+    return Result.LOADED
 
 
 def __handle_gs_result(status: Result) -> None:
@@ -117,3 +131,22 @@ def __handle_gs_result(status: Result) -> None:
     if status == Result.UNKNOWN_ERROR:
         logging.error("GS: UNKNOWN ERROR")
         raise RuntimeError("Unknown error in processing")
+
+
+def __parse_xml(xml_file_path: Path, drop_duplicate_spans: bool) -> list[MemoryMap]:
+    pages: list[MemoryMap] = []
+    tree_iterator = ET.iterparse(xml_file_path, events=("start", "end"))
+    event: str
+    root: ET.Element
+    event, root = next(tree_iterator)
+    element: ET.Element
+    for event, element in tree_iterator:
+        if event == "end" and element.tag == "page":
+            parsed_page: MemoryMap = MemoryMap()
+            parsed_page.build_memory_map()
+            parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans)
+            pages.append(parsed_page)
+            element.clear()
+        root.clear()
+    os.remove(xml_file_path)
+    return pages
diff --git a/pyproject.toml b/pyproject.toml
@@ -89,4 +89,4 @@ strict = true
 [tool.pytest.ini_options]
 log_cli=true
 log_level="NOTSET"
-addopts="-n=auto --cov --cov-report term-missing"
+addopts="--cov --cov-report term-missing"
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -26,7 +26,8 @@ def perform_speed_test(file_name, expected_processing_seconds):
     hot_pdf_object = HotPdf()
     hot_pdf_object.load(file_name)
     end_time = time.time()
-    assert (end_time - start_time) < expected_processing_seconds, "Benchmark time exceeded!"
+    elapsed = end_time - start_time
+    assert (elapsed) < expected_processing_seconds, "Benchmark time exceeded!"
 
 
 def perform_memory_test(file_name, expected_peak_memory):
@@ -39,7 +40,7 @@ def perform_memory_test(file_name, expected_peak_memory):
 
 
 def test_speed_benchmark_multiple_pages(multiple_pages_file_name):
-    perform_speed_test(multiple_pages_file_name, 4)
+    perform_speed_test(multiple_pages_file_name, 3)
 
 
 def test_memory_benchmark_multiple_pages(multiple_pages_file_name):

diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -98,12 +98,12 @@ def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name):
     hot_pdf_object = HotPdf()
     hot_pdf_object_with_dup_span = HotPdf()
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml"),
     ):
         hot_pdf_object_with_dup_span.load(mock_hotpdf_bank_file_name, drop_duplicate_spans=False)
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml"),
     ):
         hot_pdf_object.load(mock_hotpdf_bank_file_name)
@@ -114,15 +114,15 @@ def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name):
 def test_load_negative_coordinates(mock_hotpdf_bank_file_name):
     QUERY = "HOTPDF BANK"
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_negative_coords.xml"),
     ):
         hot_pdf_object = HotPdf()
         hot_pdf_object.load(mock_hotpdf_bank_file_name)
         assert not hot_pdf_object.find_text(QUERY)[0], "Expected string to be empty"
     # For sanity: The following file is same as above, except the coords are normal
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_normal_coords.xml"),
     ):
         hot_pdf_object_normal = HotPdf()

diff --git a/tests/test_load.py b/tests/test_load.py
@@ -39,6 +39,11 @@ def test_load(valid_file_name):
     hot_pdf_object.load(valid_file_name)
 
 
+def test_load_constructor(valid_file_name):
+    hotpdf_obj = HotPdf(valid_file_name)
+    assert len(hotpdf_obj.pages) > 0
+
+
 def test_load_bytes(valid_file_name):
     with open(valid_file_name, "rb") as f:
         hot_pdf_object = HotPdf()