From b14bad5fb9448c7e16aad75df5ad455ca5b65d21 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 17 Nov 2024 22:58:53 -0500 Subject: [PATCH] update to match new extractous API changes --- bbot/modules/extractous.py | 59 +++++++++++--------------------------- 1 file changed, 17 insertions(+), 42 deletions(-) diff --git a/bbot/modules/extractous.py b/bbot/modules/extractous.py index 69b98612c..471e2c07e 100644 --- a/bbot/modules/extractous.py +++ b/bbot/modules/extractous.py @@ -81,6 +81,12 @@ async def filter_event(self, event): async def handle_event(self, event): file_path = event.data["path"] content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path) + if isinstance(content, tuple): + error, traceback = content + self.error(f"Error extracting text from {file_path}: {error}") + self.trace(traceback) + return + if content: raw_text_event = self.make_event( content, @@ -99,49 +105,18 @@ def extract_text(file_path): :return: ASCII-encoded plaintext extracted from the document. """ - extractable_file_types = [ - ".csv", - ".eml", - ".msg", - ".epub", - ".xlsx", - ".xls", - ".html", - ".htm", - ".md", - ".org", - ".odt", - ".pdf", - ".txt", - ".text", - ".log", - ".ppt", - ".pptx", - ".rst", - ".rtf", - ".tsv", - ".doc", - ".docx", - ".xml", - ] - - # If the file can be extracted with extractous use its partition function or try and read it - if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types): - try: - extractor = Extractor() - reader = extractor.extract_file(str(file_path)) + try: + extractor = Extractor() + reader, metadata = extractor.extract_file(str(file_path)) - result = "" + result = "" + buffer = reader.read(4096) + while len(buffer) > 0: + result += buffer.decode("utf-8") buffer = reader.read(4096) - while len(buffer) > 0: - result += buffer.decode("utf-8") - buffer = reader.read(4096) - return result.strip() + return result.strip() + except Exception as e: + import traceback - except Exception: - with open(file_path, "rb") as file: - return file.read().decode("utf-8", errors="ignore") - else: - with open(file_path, "rb") as file: - return file.read().decode("utf-8", errors="ignore") + return (str(e), traceback.format_exc())