From 1ef9d4ba163a3de683e21c85d86fc1d0bc8926c0 Mon Sep 17 00:00:00 2001
From: Mattia Callegari <mattia.callegari@prestatech.com>
Date: Wed, 24 Jan 2024 15:26:47 +0100
Subject: [PATCH 1/5] feat: :fire: use piping instead of temp_files

---
 hotpdf/hotpdf.py    | 63 ++++++++++++++---------------------
 hotpdf/processor.py | 80 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 82 insertions(+), 61 deletions(-)

diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py
index 1961451..56af770 100644
--- a/hotpdf/hotpdf.py
+++ b/hotpdf/hotpdf.py
@@ -1,7 +1,5 @@
 import math
 import os
-import tempfile
-import xml.etree.ElementTree as ET
 from collections import defaultdict
 from typing import Optional, Union
 
@@ -16,20 +14,33 @@ class HotPdf:
     def __init__(
         self,
         extraction_tolerance: int = 4,
+        source: Union[str, bytes, None] = None,
+        password: str = "",
+        drop_duplicate_spans: bool = True,
+        first_page: int = 0,
+        last_page: int = 0,
     ) -> None:
         """Initialize the HotPdf class.
 
         Args:
             extraction_tolerance (int, optional): Tolerance value used during text extraction
                 to adjust the bounding box for capturing text. Defaults to 4.
+            pdf_file (str | bytes, optional): The path to the PDF file to be loaded, or a bytes object.
+            password (str, optional): Password to use to unlock the pdf
+            drop_duplicate_spans (bool, optional): Drop duplicate spans when loading. Defaults to True.
+            first_page (int, optional): The first page to load. Defaults to 0.
+            last_page (int, optional): The last page to load. Defaults to 0.
+
+        Raises:
+            ValueError: If the page range is invalid.
+            FileNotFoundError: If the file is not found.
+            PermissionError: If the file is encrypted or the password is wrong.
+            RuntimeError: If an unkown error is generated by Ghostscript.
         """
         self.pages: list[MemoryMap] = []
         self.extraction_tolerance: int = extraction_tolerance
-        self.xml_file_path: Optional[str] = None
-
-    def __delete_file(self, path: Union[str, None]) -> None:
-        if path and os.path.exists(path):
-            os.remove(path)
+        if source:
+            self.load(source, password, drop_duplicate_spans, first_page, last_page)
 
     def __check_file_exists(self, pdf_file: str) -> None:
         if not os.path.exists(pdf_file):
@@ -47,14 +58,14 @@ def __check_page_range(self, first_page: int, last_page: int) -> None:
         if first_page > last_page or first_page < 0 or last_page < 0:
             raise ValueError("Invalid page range")
 
-    def __prechecks(self, pdf_file: Union[str, bytes], first_page: int, last_page: int) -> None:
-        if isinstance(pdf_file, str):
-            self.__check_file_exists(pdf_file)
+    def __prechecks(self, source: Union[str, bytes], first_page: int, last_page: int) -> None:
+        if type(source) is str:
+            self.__check_file_exists(source)
         self.__check_page_range(first_page, last_page)
 
     def load(
         self,
-        pdf_file: Union[str, bytes],
+        source: Union[str, bytes],
         password: str = "",
         drop_duplicate_spans: bool = True,
         first_page: int = 0,
@@ -75,34 +86,8 @@ def load(
             PermissionError: If the file is encrypted or the password is wrong.
             RuntimeError: If an unkown error is generated by Ghostscript.
         """
-        self.__prechecks(pdf_file, first_page, last_page)
-        _bytes_file_received: bool = False
-        if isinstance(pdf_file, bytes):
-            _temp_pdf_file = tempfile.NamedTemporaryFile(delete=False)
-            _temp_pdf_file.write(pdf_file)
-            _temp_pdf_file_name = _temp_pdf_file.name
-            _bytes_file_received = True
-        else:
-            _temp_pdf_file_name = pdf_file
-        self.xml_file_path = processor.generate_xml_file(_temp_pdf_file_name, password, first_page, last_page)
-        tree_iterator = ET.iterparse(self.xml_file_path, events=("start", "end"))
-        event: str
-        root: ET.Element
-        event, root = next(tree_iterator)
-
-        element: ET.Element
-        for event, element in tree_iterator:
-            if event == "end" and element.tag == "page":
-                parsed_page: MemoryMap = MemoryMap()
-                parsed_page.build_memory_map()
-                parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans)
-                self.pages.append(parsed_page)
-                element.clear()
-            root.clear()
-
-        self.__delete_file(self.xml_file_path)
-        if _bytes_file_received:
-            self.__delete_file(_temp_pdf_file_name)
+        self.__prechecks(source, first_page, last_page)
+        self.pages = processor.process(source, password, drop_duplicate_spans, first_page, last_page)
 
     def __extract_full_text_span(
         self,
diff --git a/hotpdf/processor.py b/hotpdf/processor.py
index 182e4cd..963418d 100644
--- a/hotpdf/processor.py
+++ b/hotpdf/processor.py
@@ -4,7 +4,13 @@
 import re
 import subprocess
 import tempfile
+import xml.etree.ElementTree as ET
 from enum import Enum
+from pathlib import Path
+from typing import Union
+
+from hotpdf.helpers import nanoid
+from hotpdf.memory_map import MemoryMap
 
 
 class Result(Enum):
@@ -14,7 +20,15 @@ class Result(Enum):
     UNKNOWN_ERROR = 3
 
 
-def generate_xml_file(file_path: str, password: str, first_page: int, last_page: int) -> str:
+def process(
+    source: Union[str, bytes], password: str, drop_duplicate_spans: bool, first_page: int, last_page: int
+) -> list[MemoryMap]:
+    xml_file = __generate_xml_file(source, password, first_page, last_page)
+    pages = __parse_xml(xml_file, drop_duplicate_spans)
+    return pages
+
+
+def __generate_xml_file(source: Union[str, bytes], password: str, first_page: int, last_page: int) -> Path:
     """Generate XML notation of PDF File.
 
     Args:
@@ -28,19 +42,14 @@ def generate_xml_file(file_path: str, password: str, first_page: int, last_page:
     Returns:
         str: XML File Path.
     """
-    temp_xml_file_name = tempfile.NamedTemporaryFile(delete=False).name
-
-    result = __call_ghostscript(file_path, temp_xml_file_name, password, first_page, last_page)
-
+    temp_xml_file_path = Path(tempfile.gettempdir(), nanoid.generate_nano_id() + ".xml")
+    result = __call_ghostscript(source, temp_xml_file_path, password, first_page, last_page)
     __handle_gs_result(result)
-
-    __clean_xml(temp_xml_file_name)
-
-    return temp_xml_file_name
+    return __clean_xml(temp_xml_file_path)
 
 
 def __call_ghostscript(
-    file_path: str, temp_xml_file_name: str, password: str, first_page: int, last_page: int
+    source: Union[str, bytes], temp_xml_file_path: Path, password: str, first_page: int, last_page: int
 ) -> Result:
     ghostscript = "gs" if os.name != "nt" else "gswin64c"
     command_line_args = [ghostscript, "-dNOPAUSE", "-dBATCH", "-dSAFER", "-dTextFormat=1", "-sDEVICE=txtwrite"]
@@ -52,13 +61,19 @@ def __call_ghostscript(
     if last_page:
         command_line_args.append(f"-dLastPage={last_page}")
 
-    command_line_args.extend([f'-sOutputFile="{temp_xml_file_name}"', f'"{file_path}"'])
+    command_line_args.append(f'-sOutputFile="{temp_xml_file_path}"')
+
+    # Uses gs in pipe mode
+    pipe = type(source) is bytes
+    if pipe:
+        command_line_args.append("-")
+    else:
+        command_line_args.append(str(source))
+
     gs_call = " ".join(command_line_args)
 
     try:
-        output = subprocess.check_output(gs_call, shell=ghostscript == "gs", stderr=subprocess.STDOUT).decode(
-            errors="ignore"
-        )
+        output = subprocess.run(gs_call, shell=ghostscript == "gs", input=source if pipe else None, capture_output=True)
         status = __validate_gs_output(output)
     except subprocess.CalledProcessError as err:
         status = Result.UNKNOWN_ERROR
@@ -67,7 +82,7 @@ def __call_ghostscript(
     return status
 
 
-def __clean_xml(temporary_xml_file_name: str) -> None:
+def __clean_xml(temporary_xml_path: Path) -> Path:
     """
     Clean the raw xlm file generated by ghostscript.
     Apply changes directly to the temporaryfile.
@@ -75,7 +90,7 @@ def __clean_xml(temporary_xml_file_name: str) -> None:
     Args:
         temporary_xml_file_name (str): the temporary file outputted by ghostscript
     """
-    with open(temporary_xml_file_name, "r+", encoding="utf-8") as f:
+    with open(temporary_xml_path, "r+", encoding="utf-8") as f:
         raw_xml = f.read()
         raw_xml = re.sub(r"(&#x[0-9]+;)", "", raw_xml)
         raw_xml = re.sub(r"(&quot;)", "'", raw_xml)
@@ -89,16 +104,18 @@ def __clean_xml(temporary_xml_file_name: str) -> None:
         f.seek(0)
         f.write(raw_xml)
         f.truncate()
+    return temporary_xml_path
 
 
-def __validate_gs_output(output: str) -> Result:
-    if "This file requires a password for access" in output:
+def __validate_gs_output(output: subprocess.CompletedProcess[bytes]) -> Result:
+    if output.returncode != 0:
+        return Result.UNKNOWN_ERROR
+    err = output.stderr.decode(errors="ignore")
+    if "This file requires a password for access" in err:
         return Result.LOCKED
-    if "Password did not work" in output:
+    if "Password did not work" in err:
         return Result.WRONG_PASSWORD
-    if "Page" in output:
-        return Result.LOADED
-    return Result.UNKNOWN_ERROR
+    return Result.UNKNOWN_ERROR if "Error" in err else Result.LOADED
 
 
 def __handle_gs_result(status: Result) -> None:
@@ -117,3 +134,22 @@ def __handle_gs_result(status: Result) -> None:
     if status == Result.UNKNOWN_ERROR:
         logging.error("GS: UNKNOWN ERROR")
         raise RuntimeError("Unknown error in processing")
+
+
+def __parse_xml(xml_file_path: Path, drop_duplicate_spans: bool) -> list[MemoryMap]:
+    pages: list[MemoryMap] = []
+    tree_iterator = ET.iterparse(xml_file_path, events=("start", "end"))
+    event: str
+    root: ET.Element
+    event, root = next(tree_iterator)
+    element: ET.Element
+    for event, element in tree_iterator:
+        if event == "end" and element.tag == "page":
+            parsed_page: MemoryMap = MemoryMap()
+            parsed_page.build_memory_map()
+            parsed_page.load_memory_map(page=element, drop_duplicate_spans=drop_duplicate_spans)
+            pages.append(parsed_page)
+            element.clear()
+        root.clear()
+    os.remove(xml_file_path)
+    return pages

From 5083f9fa0b8e474f12c05ad15d6a82d2b8c544aa Mon Sep 17 00:00:00 2001
From: Mattia Callegari <mattia.callegari@prestatech.com>
Date: Wed, 24 Jan 2024 15:54:51 +0100
Subject: [PATCH 2/5] fix: :test_tube: unit testing with mocks

---
 hotpdf/processor.py     | 2 +-
 tests/test_functions.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hotpdf/processor.py b/hotpdf/processor.py
index 963418d..99d9f9f 100644
--- a/hotpdf/processor.py
+++ b/hotpdf/processor.py
@@ -115,7 +115,7 @@ def __validate_gs_output(output: subprocess.CompletedProcess[bytes]) -> Result:
         return Result.LOCKED
     if "Password did not work" in err:
         return Result.WRONG_PASSWORD
-    return Result.UNKNOWN_ERROR if "Error" in err else Result.LOADED
+    return Result.LOADED
 
 
 def __handle_gs_result(status: Result) -> None:
diff --git a/tests/test_functions.py b/tests/test_functions.py
index 66f7111..18ab22f 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -98,12 +98,12 @@ def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name):
     hot_pdf_object = HotPdf()
     hot_pdf_object_with_dup_span = HotPdf()
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml"),
     ):
         hot_pdf_object_with_dup_span.load(mock_hotpdf_bank_file_name, drop_duplicate_spans=False)
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml"),
     ):
         hot_pdf_object.load(mock_hotpdf_bank_file_name)
@@ -114,7 +114,7 @@ def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name):
 def test_load_negative_coordinates(mock_hotpdf_bank_file_name):
     QUERY = "HOTPDF BANK"
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_negative_coords.xml"),
     ):
         hot_pdf_object = HotPdf()
@@ -122,7 +122,7 @@ def test_load_negative_coordinates(mock_hotpdf_bank_file_name):
         assert not hot_pdf_object.find_text(QUERY)[0], "Expected string to be empty"
     # For sanity: The following file is same as above, except the coords are normal
     with patch(
-        "hotpdf.processor.generate_xml_file",
+        "hotpdf.processor.__generate_xml_file",
         return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_normal_coords.xml"),
     ):
         hot_pdf_object_normal = HotPdf()

From 3a1ddd3697915c7734b6ec4e62811eca6c404848 Mon Sep 17 00:00:00 2001
From: Mattia Callegari <mattia.callegari@prestatech.com>
Date: Wed, 24 Jan 2024 16:57:28 +0100
Subject: [PATCH 3/5] test: :test_tube: add constructor loading test

---
 .github/workflows/test.yml | 2 +-
 hotpdf/processor.py        | 5 +----
 pyproject.toml             | 2 +-
 tests/test_benchmark.py    | 5 +++--
 tests/test_load.py         | 5 +++++
 5 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9392a70..5de8006 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -64,7 +64,7 @@ jobs:
         run: pip install -e '.[dev]'
       - name: Run tests with coverage
         run:
-          python -m pytest --cov-fail-under=98
+          python -m pytest --cov-fail-under=98 -n=auto
       - name: Upload coverage to coveralls
         if: github.event_name == 'push'
         uses: coverallsapp/github-action@v2.2.3
diff --git a/hotpdf/processor.py b/hotpdf/processor.py
index 99d9f9f..b93303d 100644
--- a/hotpdf/processor.py
+++ b/hotpdf/processor.py
@@ -65,10 +65,7 @@ def __call_ghostscript(
 
     # Uses gs in pipe mode
     pipe = type(source) is bytes
-    if pipe:
-        command_line_args.append("-")
-    else:
-        command_line_args.append(str(source))
+    command_line_args.append("-" if pipe else str(source))
 
     gs_call = " ".join(command_line_args)
 
diff --git a/pyproject.toml b/pyproject.toml
index 317826c..0602255 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,4 +89,4 @@ strict = true
 [tool.pytest.ini_options]
 log_cli=true
 log_level="NOTSET"
-addopts="-n=auto --cov --cov-report term-missing"
+addopts="--cov --cov-report term-missing"
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index b42cc0b..308ca65 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -26,7 +26,8 @@ def perform_speed_test(file_name, expected_processing_seconds):
     hot_pdf_object = HotPdf()
     hot_pdf_object.load(file_name)
     end_time = time.time()
-    assert (end_time - start_time) < expected_processing_seconds, "Benchmark time exceeded!"
+    elapsed = end_time - start_time
+    assert (elapsed) < expected_processing_seconds, "Benchmark time exceeded!"
 
 
 def perform_memory_test(file_name, expected_peak_memory):
@@ -39,7 +40,7 @@ def perform_memory_test(file_name, expected_peak_memory):
 
 
 def test_speed_benchmark_multiple_pages(multiple_pages_file_name):
-    perform_speed_test(multiple_pages_file_name, 4)
+    perform_speed_test(multiple_pages_file_name, 3)
 
 
 def test_memory_benchmark_multiple_pages(multiple_pages_file_name):
diff --git a/tests/test_load.py b/tests/test_load.py
index 04d1e1f..f3d4207 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -39,6 +39,11 @@ def test_load(valid_file_name):
     hot_pdf_object.load(valid_file_name)
 
 
+def test_load_constructor(valid_file_name):
+    hotpdf_obj = HotPdf(source=valid_file_name)
+    assert len(hotpdf_obj.pages) > 0
+
+
 def test_load_bytes(valid_file_name):
     with open(valid_file_name, "rb") as f:
         hot_pdf_object = HotPdf()

From cc4bec44aaaa4988cb7926100d7573f5f6e0fcc5 Mon Sep 17 00:00:00 2001
From: Mattia Callegari <mattia.callegari@prestatech.com>
Date: Wed, 24 Jan 2024 17:55:25 +0100
Subject: [PATCH 4/5] refactor: :construction: renaming variables

---
 hotpdf/hotpdf.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py
index 56af770..7061e8e 100644
--- a/hotpdf/hotpdf.py
+++ b/hotpdf/hotpdf.py
@@ -14,7 +14,7 @@ class HotPdf:
     def __init__(
         self,
         extraction_tolerance: int = 4,
-        source: Union[str, bytes, None] = None,
+        pdf_file: Union[str, bytes, None] = None,
         password: str = "",
         drop_duplicate_spans: bool = True,
         first_page: int = 0,
@@ -39,8 +39,8 @@ def __init__(
         """
         self.pages: list[MemoryMap] = []
         self.extraction_tolerance: int = extraction_tolerance
-        if source:
-            self.load(source, password, drop_duplicate_spans, first_page, last_page)
+        if pdf_file:
+            self.load(pdf_file, password, drop_duplicate_spans, first_page, last_page)
 
     def __check_file_exists(self, pdf_file: str) -> None:
         if not os.path.exists(pdf_file):
@@ -58,14 +58,14 @@ def __check_page_range(self, first_page: int, last_page: int) -> None:
         if first_page > last_page or first_page < 0 or last_page < 0:
             raise ValueError("Invalid page range")
 
-    def __prechecks(self, source: Union[str, bytes], first_page: int, last_page: int) -> None:
-        if type(source) is str:
-            self.__check_file_exists(source)
+    def __prechecks(self, pdf_file: Union[str, bytes], first_page: int, last_page: int) -> None:
+        if type(pdf_file) is str:
+            self.__check_file_exists(pdf_file)
         self.__check_page_range(first_page, last_page)
 
     def load(
         self,
-        source: Union[str, bytes],
+        pdf_file: Union[str, bytes],
         password: str = "",
         drop_duplicate_spans: bool = True,
         first_page: int = 0,
@@ -86,8 +86,8 @@ def load(
             PermissionError: If the file is encrypted or the password is wrong.
             RuntimeError: If an unkown error is generated by Ghostscript.
         """
-        self.__prechecks(source, first_page, last_page)
-        self.pages = processor.process(source, password, drop_duplicate_spans, first_page, last_page)
+        self.__prechecks(pdf_file, first_page, last_page)
+        self.pages = processor.process(pdf_file, password, drop_duplicate_spans, first_page, last_page)
 
     def __extract_full_text_span(
         self,

From 9af34cab29ef70dc9c5c2355c061fc0e7fda458b Mon Sep 17 00:00:00 2001
From: Mattia Callegari <mattia.callegari@prestatech.com>
Date: Wed, 24 Jan 2024 18:03:57 +0100
Subject: [PATCH 5/5] fix: :boom: change args order for easier use, test

---
 hotpdf/hotpdf.py   | 6 +++---
 tests/test_load.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py
index 7061e8e..90e1c17 100644
--- a/hotpdf/hotpdf.py
+++ b/hotpdf/hotpdf.py
@@ -13,23 +13,23 @@
 class HotPdf:
     def __init__(
         self,
-        extraction_tolerance: int = 4,
         pdf_file: Union[str, bytes, None] = None,
         password: str = "",
         drop_duplicate_spans: bool = True,
         first_page: int = 0,
         last_page: int = 0,
+        extraction_tolerance: int = 4,
     ) -> None:
         """Initialize the HotPdf class.
 
         Args:
-            extraction_tolerance (int, optional): Tolerance value used during text extraction
-                to adjust the bounding box for capturing text. Defaults to 4.
             pdf_file (str | bytes, optional): The path to the PDF file to be loaded, or a bytes object.
             password (str, optional): Password to use to unlock the pdf
             drop_duplicate_spans (bool, optional): Drop duplicate spans when loading. Defaults to True.
             first_page (int, optional): The first page to load. Defaults to 0.
             last_page (int, optional): The last page to load. Defaults to 0.
+            extraction_tolerance (int, optional): Tolerance value used during text extraction
+                to adjust the bounding box for capturing text. Defaults to 4.
 
         Raises:
             ValueError: If the page range is invalid.
diff --git a/tests/test_load.py b/tests/test_load.py
index f3d4207..4a6f707 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -40,7 +40,7 @@ def test_load(valid_file_name):
 
 
 def test_load_constructor(valid_file_name):
-    hotpdf_obj = HotPdf(source=valid_file_name)
+    hotpdf_obj = HotPdf(valid_file_name)
     assert len(hotpdf_obj.pages) > 0