Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Extractous with new API changes #1976

Merged
merged 1 commit into from
Nov 18, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 17 additions & 42 deletions bbot/modules/extractous.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ async def filter_event(self, event):
async def handle_event(self, event):
file_path = event.data["path"]
content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path)
if isinstance(content, tuple):
error, traceback = content
self.error(f"Error extracting text from {file_path}: {error}")
self.trace(traceback)
return

if content:
raw_text_event = self.make_event(
content,
Expand All @@ -99,49 +105,18 @@ def extract_text(file_path):
:return: ASCII-encoded plaintext extracted from the document.
"""

extractable_file_types = [
".csv",
".eml",
".msg",
".epub",
".xlsx",
".xls",
".html",
".htm",
".md",
".org",
".odt",
".pdf",
".txt",
".text",
".log",
".ppt",
".pptx",
".rst",
".rtf",
".tsv",
".doc",
".docx",
".xml",
]

# If the file can be extracted with extractous use its partition function or try and read it
if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types):
try:
extractor = Extractor()
reader = extractor.extract_file(str(file_path))
try:
extractor = Extractor()
reader, metadata = extractor.extract_file(str(file_path))

result = ""
result = ""
buffer = reader.read(4096)
while len(buffer) > 0:
result += buffer.decode("utf-8")
buffer = reader.read(4096)
while len(buffer) > 0:
result += buffer.decode("utf-8")
buffer = reader.read(4096)

return result.strip()
return result.strip()
except Exception as e:
import traceback

except Exception:
with open(file_path, "rb") as file:
return file.read().decode("utf-8", errors="ignore")
else:
with open(file_path, "rb") as file:
return file.read().decode("utf-8", errors="ignore")
return (str(e), traceback.format_exc())
Loading