From b14bad5fb9448c7e16aad75df5ad455ca5b65d21 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Sun, 17 Nov 2024 22:58:53 -0500
Subject: [PATCH] update to match new extractous API changes

---
 bbot/modules/extractous.py | 59 +++++++++++---------------------------
 1 file changed, 17 insertions(+), 42 deletions(-)

diff --git a/bbot/modules/extractous.py b/bbot/modules/extractous.py
index 69b98612c..471e2c07e 100644
--- a/bbot/modules/extractous.py
+++ b/bbot/modules/extractous.py
@@ -81,6 +81,12 @@ async def filter_event(self, event):
     async def handle_event(self, event):
         file_path = event.data["path"]
         content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path)
+        if isinstance(content, tuple):
+            error, traceback = content
+            self.error(f"Error extracting text from {file_path}: {error}")
+            self.trace(traceback)
+            return
+
         if content:
             raw_text_event = self.make_event(
                 content,
@@ -99,49 +105,18 @@ def extract_text(file_path):
     :return: ASCII-encoded plaintext extracted from the document.
     """
 
-    extractable_file_types = [
-        ".csv",
-        ".eml",
-        ".msg",
-        ".epub",
-        ".xlsx",
-        ".xls",
-        ".html",
-        ".htm",
-        ".md",
-        ".org",
-        ".odt",
-        ".pdf",
-        ".txt",
-        ".text",
-        ".log",
-        ".ppt",
-        ".pptx",
-        ".rst",
-        ".rtf",
-        ".tsv",
-        ".doc",
-        ".docx",
-        ".xml",
-    ]
-
-    # If the file can be extracted with extractous use its partition function or try and read it
-    if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types):
-        try:
-            extractor = Extractor()
-            reader = extractor.extract_file(str(file_path))
+    try:
+        extractor = Extractor()
+        reader, metadata = extractor.extract_file(str(file_path))
 
-            result = ""
+        result = ""
+        buffer = reader.read(4096)
+        while len(buffer) > 0:
+            result += buffer.decode("utf-8")
             buffer = reader.read(4096)
-            while len(buffer) > 0:
-                result += buffer.decode("utf-8")
-                buffer = reader.read(4096)
 
-            return result.strip()
+        return result.strip()
+    except Exception as e:
+        import traceback
 
-        except Exception:
-            with open(file_path, "rb") as file:
-                return file.read().decode("utf-8", errors="ignore")
-    else:
-        with open(file_path, "rb") as file:
-            return file.read().decode("utf-8", errors="ignore")
+        return (str(e), traceback.format_exc())