From 874b91eacad3292286db1f4b9845b63d424378dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Sun, 12 Mar 2023 11:25:00 +0100
Subject: [PATCH] Refactor to_text() to return string instead of bytes

The function name "to_text" suggests it should return text. Python uses
the str type for storing text, so having that function return bytes was
counter-intuitive.

This also simplifies the code, as most input methods deal with the str
type. With this change there is no need to encode str into bytes and
then decode it back.
Only two input methods actually deal with bytes: "pdftotext" and
"tesseract". Make them decode the bytes into str before returning from
to_text().
---
 src/invoice2data/extract/invoice_template.py | 2 +-
 src/invoice2data/input/gvision.py            | 2 +-
 src/invoice2data/input/pdfminer_wrapper.py   | 2 +-
 src/invoice2data/input/pdfplumber.py         | 2 +-
 src/invoice2data/input/pdftotext.py          | 2 +-
 src/invoice2data/input/tesseract.py          | 2 +-
 src/invoice2data/input/text.py               | 2 +-
 src/invoice2data/main.py                     | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/invoice2data/extract/invoice_template.py b/src/invoice2data/extract/invoice_template.py
index 6b43dd28..e381f08e 100644
--- a/src/invoice2data/extract/invoice_template.py
+++ b/src/invoice2data/extract/invoice_template.py
@@ -206,7 +206,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O
                     logger.debug(f"Area was specified with parameters {v['area']}")
                     # Extract the text for the specified area
                     # Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
-                    optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
+                    optimized_str_area = input_module.to_text(invoice_file, v['area'])
                     # Log the text
                     logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
                     logger.debug("END pdftotext area result =============================")
diff --git a/src/invoice2data/input/gvision.py b/src/invoice2data/input/gvision.py
index 4deabe95..a60b9d3f 100644
--- a/src/invoice2data/input/gvision.py
+++ b/src/invoice2data/input/gvision.py
@@ -84,4 +84,4 @@ def to_text(path, bucket_name="cloud-vision-84893", language="en"):
     first_page_response = response.responses[0]
     annotation = first_page_response.full_text_annotation
 
-    return annotation.text.encode("utf-8")
+    return annotation.text
diff --git a/src/invoice2data/input/pdfminer_wrapper.py b/src/invoice2data/input/pdfminer_wrapper.py
index 98134fa4..8fafb359 100644
--- a/src/invoice2data/input/pdfminer_wrapper.py
+++ b/src/invoice2data/input/pdfminer_wrapper.py
@@ -53,4 +53,4 @@ def to_text(path):
     device.close()
     out = retstr.getvalue()
     retstr.close()
-    return out.encode("utf-8")
+    return out
diff --git a/src/invoice2data/input/pdfplumber.py b/src/invoice2data/input/pdfplumber.py
index aa8a472b..a2e53500 100644
--- a/src/invoice2data/input/pdfplumber.py
+++ b/src/invoice2data/input/pdfplumber.py
@@ -38,7 +38,7 @@ def to_text(path):
     logger.debug("Text extraction made with pdfplumber")
 
     raw_text = res_to_raw_text(res)
-    return raw_text.encode("utf-8")
+    return raw_text
 
 
 def res_to_raw_text(res):
diff --git a/src/invoice2data/input/pdftotext.py b/src/invoice2data/input/pdftotext.py
index 76f56092..e3cef464 100644
--- a/src/invoice2data/input/pdftotext.py
+++ b/src/invoice2data/input/pdftotext.py
@@ -50,7 +50,7 @@ def to_text(path: str, area_details: dict = None):
         cmd += [path, "-"]
         # Run the extraction
         out, err = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()
-        return out
+        return out.decode('utf-8')
     else:
         raise EnvironmentError(
             "pdftotext not installed. Can be downloaded from https://poppler.freedesktop.org/"
diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
index 56ebd20b..57cd70a8 100644
--- a/src/invoice2data/input/tesseract.py
+++ b/src/invoice2data/input/tesseract.py
@@ -142,7 +142,7 @@ def to_text(path: str, area_details: dict = None):
     except TimeoutExpired:
         p3.kill()
         logger.warning("pdftotext took too long - skipping")
-    return extracted_str
+    return extracted_str.decode('utf-8')
 
 
 def get_languages():
diff --git a/src/invoice2data/input/text.py b/src/invoice2data/input/text.py
index 8db75be6..a88f27b4 100644
--- a/src/invoice2data/input/text.py
+++ b/src/invoice2data/input/text.py
@@ -2,4 +2,4 @@
 
 def to_text(path):
     with open(path, 'r') as f:
-        return f.read().encode('utf-8')
+        return f.read()
diff --git a/src/invoice2data/main.py b/src/invoice2data/main.py
index d0858967..cea046ce 100644
--- a/src/invoice2data/main.py
+++ b/src/invoice2data/main.py
@@ -86,7 +86,7 @@ def extract_data(invoicefile, templates=None, input_module=None):
         else:
             input_module = pdftotext
 
-    extracted_str = input_module.to_text(invoicefile).decode("utf-8")
+    extracted_str = input_module.to_text(invoicefile)
     if not isinstance(extracted_str, str) or not extracted_str.strip():
         logger.error("Failed to extract text from %s using %s", invoicefile, input_module.__name__)
         return False