From 874b91eacad3292286db1f4b9845b63d424378dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sun, 12 Mar 2023 11:25:00 +0100 Subject: [PATCH] Refactor to_text() to return string instead of bytes Function name "to_text" suggests it should return text. Python uses str type for storing text. Having that function return bytes was counter-intuitive. This also simplifies code as most input methods deal with str type. With this change there is no need to encode str into bytes and decode it back. There are actually only 2 input methods dealing with bytes: "pdftotext" and "tesseract". Make them decode bytes into str before returning from to_text(). --- src/invoice2data/extract/invoice_template.py | 2 +- src/invoice2data/input/gvision.py | 2 +- src/invoice2data/input/pdfminer_wrapper.py | 2 +- src/invoice2data/input/pdfplumber.py | 2 +- src/invoice2data/input/pdftotext.py | 2 +- src/invoice2data/input/tesseract.py | 2 +- src/invoice2data/input/text.py | 2 +- src/invoice2data/main.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/invoice2data/extract/invoice_template.py b/src/invoice2data/extract/invoice_template.py index 6b43dd28..e381f08e 100644 --- a/src/invoice2data/extract/invoice_template.py +++ b/src/invoice2data/extract/invoice_template.py @@ -206,7 +206,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O logger.debug(f"Area was specified with parameters {v['area']}") # Extract the text for the specified area # Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields! 
- optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8") + optimized_str_area = input_module.to_text(invoice_file, v['area']) # Log the text logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area) logger.debug("END pdftotext area result =============================") diff --git a/src/invoice2data/input/gvision.py b/src/invoice2data/input/gvision.py index 4deabe95..a60b9d3f 100644 --- a/src/invoice2data/input/gvision.py +++ b/src/invoice2data/input/gvision.py @@ -84,4 +84,4 @@ def to_text(path, bucket_name="cloud-vision-84893", language="en"): first_page_response = response.responses[0] annotation = first_page_response.full_text_annotation - return annotation.text.encode("utf-8") + return annotation.text diff --git a/src/invoice2data/input/pdfminer_wrapper.py b/src/invoice2data/input/pdfminer_wrapper.py index 98134fa4..8fafb359 100644 --- a/src/invoice2data/input/pdfminer_wrapper.py +++ b/src/invoice2data/input/pdfminer_wrapper.py @@ -53,4 +53,4 @@ def to_text(path): device.close() out = retstr.getvalue() retstr.close() - return out.encode("utf-8") + return out diff --git a/src/invoice2data/input/pdfplumber.py b/src/invoice2data/input/pdfplumber.py index aa8a472b..a2e53500 100644 --- a/src/invoice2data/input/pdfplumber.py +++ b/src/invoice2data/input/pdfplumber.py @@ -38,7 +38,7 @@ def to_text(path): logger.debug("Text extraction made with pdfplumber") raw_text = res_to_raw_text(res) - return raw_text.encode("utf-8") + return raw_text def res_to_raw_text(res): diff --git a/src/invoice2data/input/pdftotext.py b/src/invoice2data/input/pdftotext.py index 76f56092..e3cef464 100644 --- a/src/invoice2data/input/pdftotext.py +++ b/src/invoice2data/input/pdftotext.py @@ -50,7 +50,7 @@ def to_text(path: str, area_details: dict = None): cmd += [path, "-"] # Run the extraction out, err = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate() - return out + return out.decode('utf-8') else: raise 
EnvironmentError( "pdftotext not installed. Can be downloaded from https://poppler.freedesktop.org/" diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py index 56ebd20b..57cd70a8 100644 --- a/src/invoice2data/input/tesseract.py +++ b/src/invoice2data/input/tesseract.py @@ -142,7 +142,7 @@ def to_text(path: str, area_details: dict = None): except TimeoutExpired: p3.kill() logger.warning("pdftotext took too long - skipping") - return extracted_str + return extracted_str.decode('utf-8') def get_languages(): diff --git a/src/invoice2data/input/text.py b/src/invoice2data/input/text.py index 8db75be6..a88f27b4 100644 --- a/src/invoice2data/input/text.py +++ b/src/invoice2data/input/text.py @@ -2,4 +2,4 @@ def to_text(path): with open(path, 'r') as f: - return f.read().encode('utf-8') + return f.read() diff --git a/src/invoice2data/main.py b/src/invoice2data/main.py index d0858967..cea046ce 100644 --- a/src/invoice2data/main.py +++ b/src/invoice2data/main.py @@ -86,7 +86,7 @@ def extract_data(invoicefile, templates=None, input_module=None): else: input_module = pdftotext - extracted_str = input_module.to_text(invoicefile).decode("utf-8") + extracted_str = input_module.to_text(invoicefile) if not isinstance(extracted_str, str) or not extracted_str.strip(): logger.error("Failed to extract text from %s using %s", invoicefile, input_module.__name__) return False