From 26f6a1d76a736c690a4d1b57fce32a3faec6742d Mon Sep 17 00:00:00 2001 From: Arkajit Datta <61142632+Arkajit-Datta@users.noreply.github.com> Date: Wed, 6 Sep 2023 16:32:26 +0530 Subject: [PATCH] PDF and DOCX support in Write File - Feature Improvement, close #548 (#1125) Co-authored-by: Fluder-Paradyne <121793617+Fluder-Paradyne@users.noreply.github.com> Co-authored-by: Abhijeet <129729795+luciferlinx101@users.noreply.github.com> --- Dockerfile | 4 +- DockerfileCelery | 2 + requirements.txt | 3 + superagi/exceptions/__init__.py | 0 superagi/exceptions/file_exceptions.py | 10 ++ superagi/resource_manager/file_manager.py | 104 +++++++++++--- .../tools/file/prompts/add_images_to_html.txt | 4 + .../file/prompts/content_to_html_prompt.txt | 7 + superagi/tools/file/write_file.py | 130 ++++++++++++++++-- .../resource_manager/test_file_manager.py | 32 ++++- 10 files changed, 266 insertions(+), 30 deletions(-) create mode 100644 superagi/exceptions/__init__.py create mode 100644 superagi/exceptions/file_exceptions.py create mode 100644 superagi/tools/file/prompts/add_images_to_html.txt create mode 100644 superagi/tools/file/prompts/content_to_html_prompt.txt diff --git a/Dockerfile b/Dockerfile index 420f68cad..4d9c01dde 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.10-slim-bullseye AS compile-image WORKDIR /app RUN apt-get update && \ - apt-get install --no-install-recommends -y wget libpq-dev gcc g++ python3-dev && \ + apt-get install --no-install-recommends -y wget libpq-dev gcc g++ python3-dev wkhtmltopdf && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -24,7 +24,7 @@ FROM python:3.10-slim-bullseye AS build-image WORKDIR /app RUN apt-get update && \ - apt-get install --no-install-recommends -y libpq-dev && \ + apt-get install --no-install-recommends -y libpq-dev wkhtmltopdf && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/DockerfileCelery b/DockerfileCelery index 682e50824..ba608dcbd 100644 --- a/DockerfileCelery +++ 
b/DockerfileCelery @@ -3,6 +3,8 @@ FROM python:3.9 WORKDIR /app #RUN apt-get update && apt-get install --no-install-recommends -y git wget libpq-dev gcc python3-dev && pip install psycopg2 +RUN apt-get update && apt-get install -y wkhtmltopdf + RUN pip install --upgrade pip COPY requirements.txt . diff --git a/requirements.txt b/requirements.txt index ab45bb1c7..b5102bb18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -156,5 +156,8 @@ html2text==2020.1.16 duckduckgo-search==3.8.3 google-generativeai==0.1.0 unstructured==0.8.1 +beautifulsoup4==4.12.2 +pdfkit==1.0.0 +htmldocx==0.0.6 ai21==1.2.6 typing-extensions==4.5.0 diff --git a/superagi/exceptions/__init__.py b/superagi/exceptions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/superagi/exceptions/file_exceptions.py b/superagi/exceptions/file_exceptions.py new file mode 100644 index 000000000..354135196 --- /dev/null +++ b/superagi/exceptions/file_exceptions.py @@ -0,0 +1,10 @@ + +class UnsupportedFileTypeError(Exception): + def __init__(self, file_name: str, supported_types: list): + message = f"Unsupported file type for '{file_name}'. Supported types are: {', '.join(supported_types)}" + super().__init__(message) + +class FileNotCreatedError(Exception): + def __init__(self, file_name: str): + message = f"Failed to create the file '{file_name}'." 
+ super().__init__(message) \ No newline at end of file diff --git a/superagi/resource_manager/file_manager.py b/superagi/resource_manager/file_manager.py index 4c20ba16d..45ed90d7b 100644 --- a/superagi/resource_manager/file_manager.py +++ b/superagi/resource_manager/file_manager.py @@ -1,18 +1,25 @@ import csv from sqlalchemy.orm import Session -from superagi.config.config import get_config import os + +from superagi.config.config import get_config from superagi.helper.resource_helper import ResourceHelper from superagi.helper.s3_helper import S3Helper from superagi.lib.logger import logger from superagi.models.agent import Agent from superagi.models.agent_execution import AgentExecution from superagi.types.storage_types import StorageType +from superagi.exceptions.file_exceptions import UnsupportedFileTypeError, FileNotCreatedError + +import pdfkit +from htmldocx import HtmlToDocx + class FileManager: def __init__(self, session: Session, agent_id: int = None, agent_execution_id: int = None): self.session = session self.agent_id = agent_id self.agent_execution_id = agent_execution_id + def write_binary_file(self, file_name: str, data): if self.agent_id is not None: final_path = ResourceHelper.get_agent_write_resource_path(file_name, @@ -32,6 +39,7 @@ def write_binary_file(self, file_name: str, data): return f"Binary {file_name} saved successfully" except Exception as err: return f"Error write_binary_file: {err}" + def write_to_s3(self, file_name, final_path): with open(final_path, 'rb') as img: resource = ResourceHelper.make_written_file_resource(file_name=file_name, @@ -55,25 +63,16 @@ def write_file(self, file_name: str, content): self.agent_execution_id)) else: final_path = ResourceHelper.get_resource_path(file_name) + try: - with open(final_path, mode="w") as file: - file.write(content) - file.close() - self.write_to_s3(file_name, final_path) - logger.info(f"{file_name} - File written successfully") - return f"{file_name} - File written successfully" + 
self.save_file_by_type(file_name=file_name, file_path=final_path, content=content) except Exception as err: return f"Error write_file: {err}" - def write_csv_file(self, file_name: str, csv_data): - if self.agent_id is not None: - final_path = ResourceHelper.get_agent_write_resource_path(file_name, - agent=Agent.get_agent_from_id(self.session, - self.agent_id), - agent_execution=AgentExecution - .get_agent_execution_from_id(self.session, - self.agent_execution_id)) - else: - final_path = ResourceHelper.get_resource_path(file_name) + + logger.info(f"{file_name} - File written successfully") + return f"{file_name} - File written successfully" + + def write_csv_file(self, file_name: str, final_path: str, csv_data) -> str: try: with open(final_path, mode="w", newline="") as file: writer = csv.writer(file, lineterminator="\n") @@ -82,15 +81,63 @@ def write_csv_file(self, file_name: str, csv_data): logger.info(f"{file_name} - File written successfully") return f"{file_name} - File written successfully" except Exception as err: - return f"Error write_csv_file: {err}" + raise FileNotCreatedError(file_name=file_name) from err + + def write_pdf_file(self, file_name: str ,file_path: str, content): + # Saving the HTML file + html_file_path = f"{file_path[:-4]}.html" + self.write_txt_file(file_name=html_file_path.split('/')[-1], file_path=html_file_path, content=content) + + # Convert HTML file to a PDF file + try: + options = { + 'quiet': '', + 'page-size': 'Letter', + 'margin-top': '0.75in', + 'margin-right': '0.75in', + 'margin-bottom': '0.75in', + 'margin-left': '0.75in', + 'enable-local-file-access': '' + } + config = pdfkit.configuration(wkhtmltopdf = "/usr/bin/wkhtmltopdf") + pdfkit.from_file(html_file_path, file_path, options = options, configuration = config) + self.write_to_s3(file_name, file_path) + return file_path + except Exception as err: + raise FileNotCreatedError(file_name=file_name) from err + + def write_docx_file(self, file_name: str ,file_path: str, 
content):
+        # Save the intermediate HTML beside the target; splitext drops '.docx' or '.doc' whatever its length
+        html_file_path = f"{os.path.splitext(file_path)[0]}.html"
+        self.write_txt_file(file_name=html_file_path.split('/')[-1], file_path=html_file_path, content=content)
+        # Convert HTML file to a DOCx file
+        try:
+            new_parser = HtmlToDocx()
+            new_parser.parse_html_file(html_file_path, file_path)
+            self.write_to_s3(file_name, file_path)
+            return file_path
+        except Exception as err:
+            raise FileNotCreatedError(file_name=file_name) from err
+
+    def write_txt_file(self, file_name: str ,file_path: str, content) -> str:
+        try:
+            with open(file_path, mode="w") as file:
+                file.write(content)
+                file.close()
+            self.write_to_s3(file_name, file_path)
+            return file_path
+        except Exception as err:
+            raise FileNotCreatedError(file_name=file_name) from err
+
     def get_agent_resource_path(self, file_name: str):
         return ResourceHelper.get_agent_write_resource_path(file_name,
                                                             agent=Agent.get_agent_from_id(self.session,
                                                                                           self.agent_id),
                                                             agent_execution=AgentExecution
                                                             .get_agent_execution_from_id(self.session,
                                                                                          self.agent_execution_id))
+
     def read_file(self, file_name: str):
         if self.agent_id is not None:
             final_path = self.get_agent_resource_path(file_name)
@@ -104,6 +151,7 @@ def read_file(self, file_name: str):
             return content
         except Exception as err:
             return f"Error while reading file {file_name}: {err}"
+
     def get_files(self):
         """
         Gets all file names generated by the CodingTool.
@@ -122,3 +170,23 @@ def get_files(self):
             logger.error(f"Error while accessing files in {final_path}: {err}")
             files = []
         return files
+
+    def save_file_by_type(self, file_name: str, file_path: str, content):
+        """Write `content` to `file_path` using the writer registered for `file_name`'s extension; return its result."""
+        # Extract the file type from the file_name
+        file_type = file_name.split('.')[-1].lower()
+
+        # Dictionary to map file types to corresponding functions
+        file_type_handlers = {
+            'txt': self.write_txt_file,
+            'pdf': self.write_pdf_file,
+            'docx': self.write_docx_file,
+            'doc': self.write_docx_file,
+            'csv': self.write_csv_file,
+            'html': self.write_txt_file
+            # NOTE: Add more file types and corresponding functions as needed, These functions should be defined
+        }
+
+        if file_type not in file_type_handlers:
+            raise UnsupportedFileTypeError(file_name=file_name, supported_types=list(file_type_handlers))
+        return file_type_handlers[file_type](file_name, file_path, content)
\ No newline at end of file
diff --git a/superagi/tools/file/prompts/add_images_to_html.txt b/superagi/tools/file/prompts/add_images_to_html.txt
new file mode 100644
index 000000000..57dda0a49
--- /dev/null
+++ b/superagi/tools/file/prompts/add_images_to_html.txt
@@ -0,0 +1,4 @@
+Now, you will be provided with few image path locations. You will have to attach the following images in appropriate locations inside the html code.
+Remember to maintain the elegancy and styling of the User Interface generated. Make sure you attach all the images provided to you.
+
+The relevant paths of the images are provided below:
diff --git a/superagi/tools/file/prompts/content_to_html_prompt.txt b/superagi/tools/file/prompts/content_to_html_prompt.txt
new file mode 100644
index 000000000..e450f46e4
--- /dev/null
+++ b/superagi/tools/file/prompts/content_to_html_prompt.txt
@@ -0,0 +1,7 @@
+You are an HTML code generating AI Agent. Your task is to generate a well formatted and well styled HTML file for a given content.
+Remember to style the HTML beautifully, for which you can add the