Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor to_text() to return string instead of bytes #493

Merged
merged 1 commit into from
Mar 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/invoice2data/extract/invoice_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O
logger.debug(f"Area was specified with parameters {v['area']}")
# Extract the text for the specified area
# Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
- optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
+ optimized_str_area = input_module.to_text(invoice_file, v['area'])
# Log the text
logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
logger.debug("END pdftotext area result =============================")
Expand Down
2 changes: 1 addition & 1 deletion src/invoice2data/input/gvision.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,4 @@ def to_text(path, bucket_name="cloud-vision-84893", language="en"):
first_page_response = response.responses[0]
annotation = first_page_response.full_text_annotation

- return annotation.text.encode("utf-8")
+ return annotation.text
2 changes: 1 addition & 1 deletion src/invoice2data/input/pdfminer_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ def to_text(path):
device.close()
out = retstr.getvalue()
retstr.close()
- return out.encode("utf-8")
+ return out
2 changes: 1 addition & 1 deletion src/invoice2data/input/pdfplumber.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def to_text(path):
logger.debug("Text extraction made with pdfplumber")

raw_text = res_to_raw_text(res)
- return raw_text.encode("utf-8")
+ return raw_text


def res_to_raw_text(res):
Expand Down
2 changes: 1 addition & 1 deletion src/invoice2data/input/pdftotext.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def to_text(path: str, area_details: dict = None):
cmd += [path, "-"]
# Run the extraction
out, err = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()
- return out
+ return out.decode('utf-8')
else:
raise EnvironmentError(
"pdftotext not installed. Can be downloaded from https://poppler.freedesktop.org/"
Expand Down
2 changes: 1 addition & 1 deletion src/invoice2data/input/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def to_text(path: str, area_details: dict = None):
except TimeoutExpired:
p3.kill()
logger.warning("pdftotext took too long - skipping")
- return extracted_str
+ return extracted_str.decode('utf-8')


def get_languages():
Expand Down
2 changes: 1 addition & 1 deletion src/invoice2data/input/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

def to_text(path):
with open(path, 'r') as f:
- return f.read().encode('utf-8')
+ return f.read()
2 changes: 1 addition & 1 deletion src/invoice2data/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def extract_data(invoicefile, templates=None, input_module=None):
else:
input_module = pdftotext

- extracted_str = input_module.to_text(invoicefile).decode("utf-8")
+ extracted_str = input_module.to_text(invoicefile)
if not isinstance(extracted_str, str) or not extracted_str.strip():
logger.error("Failed to extract text from %s using %s", invoicefile, input_module.__name__)
return False
Expand Down