From 3c0ae3536d3c23836ae5904fb8309d73a85e7971 Mon Sep 17 00:00:00 2001
From: gozineb
Date: Tue, 14 Nov 2023 08:44:26 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=9A=20create=20"files"=20package?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/celery_worker.py | 8 ++---
 backend/models/files.py | 2 +-
 .../{parsers => packages/files}/__init__.py | 0
 backend/{utils => packages/files}/file.py | 0
 backend/packages/files/parsers/__init__.py | 0
 backend/{ => packages/files}/parsers/audio.py | 3 +-
 .../files}/parsers/code_python.py | 0
 .../{ => packages/files}/parsers/common.py | 0
 backend/{ => packages/files}/parsers/csv.py | 0
 backend/{ => packages/files}/parsers/docx.py | 0
 backend/{ => packages/files}/parsers/epub.py | 0
 .../{ => packages/files}/parsers/github.py | 2 +-
 backend/{ => packages/files}/parsers/html.py | 0
 .../{ => packages/files}/parsers/markdown.py | 0
 .../{ => packages/files}/parsers/notebook.py | 0
 backend/{ => packages/files}/parsers/odt.py | 0
 backend/{ => packages/files}/parsers/pdf.py | 0
 .../files}/parsers/powerpoint.py | 0
 .../{ => packages/files}/parsers/telegram.py | 0
 backend/{ => packages/files}/parsers/txt.py | 0
 backend/{ => packages/files}/parsers/xlsx.py | 0
 .../{utils => packages/files}/processors.py | 33 ++++++++++---------
 backend/routes/crawl_routes.py | 2 +-
 backend/routes/upload_routes.py | 2 +-
 24 files changed, 25 insertions(+), 27 deletions(-)
 rename backend/{parsers => packages/files}/__init__.py (100%)
 rename backend/{utils => packages/files}/file.py (100%)
 create mode 100644 backend/packages/files/parsers/__init__.py
 rename backend/{ => packages/files}/parsers/audio.py (97%)
 rename backend/{ => packages/files}/parsers/code_python.py (100%)
 rename backend/{ => packages/files}/parsers/common.py (100%)
 rename backend/{ => packages/files}/parsers/csv.py (100%)
 rename backend/{ => packages/files}/parsers/docx.py (100%)
 rename backend/{ => packages/files}/parsers/epub.py (100%)
 rename backend/{ => packages/files}/parsers/github.py (97%)
 rename backend/{ => packages/files}/parsers/html.py (100%)
 rename backend/{ => packages/files}/parsers/markdown.py (100%)
 rename backend/{ => packages/files}/parsers/notebook.py (100%)
 rename backend/{ => packages/files}/parsers/odt.py (100%)
 rename backend/{ => packages/files}/parsers/pdf.py (100%)
 rename backend/{ => packages/files}/parsers/powerpoint.py (100%)
 rename backend/{ => packages/files}/parsers/telegram.py (100%)
 rename backend/{ => packages/files}/parsers/txt.py (100%)
 rename backend/{ => packages/files}/parsers/xlsx.py (100%)
 rename backend/{utils => packages/files}/processors.py (81%)

diff --git a/backend/celery_worker.py b/backend/celery_worker.py
index 5ee8564067dd..e5d6a365fc2a 100644
--- a/backend/celery_worker.py
+++ b/backend/celery_worker.py
@@ -10,15 +10,13 @@
 from models.files import File
 from models.notifications import NotificationsStatusEnum
 from models.settings import get_supabase_client
-from parsers.github import process_github
-from repository.brain.update_brain_last_update_time import (
-    update_brain_last_update_time,
-)
+from packages.files.parsers.github import process_github
+from packages.files.processors import filter_file
+from repository.brain.update_brain_last_update_time import update_brain_last_update_time
 from repository.notification.update_notification import update_notification_by_id
 from repository.onboarding.remove_onboarding_more_than_x_days import (
     remove_onboarding_more_than_x_days,
 )
-from utils.processors import filter_file
 
 CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "")
 CELEBRY_BROKER_QUEUE_NAME = os.getenv("CELEBRY_BROKER_QUEUE_NAME", "quivr")
diff --git a/backend/models/files.py b/backend/models/files.py
index 0bd7e00f8d66..f8a6ad6afc7c 100644
--- a/backend/models/files.py
+++ b/backend/models/files.py
@@ -9,8 +9,8 @@
 from models.brains import Brain
 from models.databases.supabase.supabase import SupabaseDB
 from models.settings import get_supabase_db
+from packages.files.file import compute_sha1_from_file
 from pydantic import BaseModel
-from utils.file import compute_sha1_from_file
 
 logger = get_logger(__name__)
 
diff --git a/backend/parsers/__init__.py b/backend/packages/files/__init__.py
similarity index 100%
rename from backend/parsers/__init__.py
rename to backend/packages/files/__init__.py
diff --git a/backend/utils/file.py b/backend/packages/files/file.py
similarity index 100%
rename from backend/utils/file.py
rename to backend/packages/files/file.py
diff --git a/backend/packages/files/parsers/__init__.py b/backend/packages/files/parsers/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/backend/parsers/audio.py b/backend/packages/files/parsers/audio.py
similarity index 97%
rename from backend/parsers/audio.py
rename to backend/packages/files/parsers/audio.py
index b4a798117299..f715c55b0de8 100644
--- a/backend/parsers/audio.py
+++ b/backend/packages/files/parsers/audio.py
@@ -5,9 +5,8 @@
 import openai
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-
 from models import File, get_documents_vector_store
-from utils.file import compute_sha1_from_content
+from packages.files.file import compute_sha1_from_content
 
 
 async def process_audio(
diff --git a/backend/parsers/code_python.py b/backend/packages/files/parsers/code_python.py
similarity index 100%
rename from backend/parsers/code_python.py
rename to backend/packages/files/parsers/code_python.py
diff --git a/backend/parsers/common.py b/backend/packages/files/parsers/common.py
similarity index 100%
rename from backend/parsers/common.py
rename to backend/packages/files/parsers/common.py
diff --git a/backend/parsers/csv.py b/backend/packages/files/parsers/csv.py
similarity index 100%
rename from backend/parsers/csv.py
rename to backend/packages/files/parsers/csv.py
diff --git a/backend/parsers/docx.py b/backend/packages/files/parsers/docx.py
similarity index 100%
rename from backend/parsers/docx.py
rename to backend/packages/files/parsers/docx.py
diff --git a/backend/parsers/epub.py b/backend/packages/files/parsers/epub.py
similarity index 100%
rename from backend/parsers/epub.py
rename to backend/packages/files/parsers/epub.py
diff --git a/backend/parsers/github.py b/backend/packages/files/parsers/github.py
similarity index 97%
rename from backend/parsers/github.py
rename to backend/packages/files/parsers/github.py
index c4eb0de141e7..be1113611e3f 100644
--- a/backend/parsers/github.py
+++ b/backend/packages/files/parsers/github.py
@@ -5,7 +5,7 @@
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from models import Brain, File
-from utils.file import compute_sha1_from_content
+from packages.files.file import compute_sha1_from_content
 from utils.vectors import Neurons
 
 
diff --git a/backend/parsers/html.py b/backend/packages/files/parsers/html.py
similarity index 100%
rename from backend/parsers/html.py
rename to backend/packages/files/parsers/html.py
diff --git a/backend/parsers/markdown.py b/backend/packages/files/parsers/markdown.py
similarity index 100%
rename from backend/parsers/markdown.py
rename to backend/packages/files/parsers/markdown.py
diff --git a/backend/parsers/notebook.py b/backend/packages/files/parsers/notebook.py
similarity index 100%
rename from backend/parsers/notebook.py
rename to backend/packages/files/parsers/notebook.py
diff --git a/backend/parsers/odt.py b/backend/packages/files/parsers/odt.py
similarity index 100%
rename from backend/parsers/odt.py
rename to backend/packages/files/parsers/odt.py
diff --git a/backend/parsers/pdf.py b/backend/packages/files/parsers/pdf.py
similarity index 100%
rename from backend/parsers/pdf.py
rename to backend/packages/files/parsers/pdf.py
diff --git a/backend/parsers/powerpoint.py b/backend/packages/files/parsers/powerpoint.py
similarity index 100%
rename from backend/parsers/powerpoint.py
rename to backend/packages/files/parsers/powerpoint.py
diff --git a/backend/parsers/telegram.py b/backend/packages/files/parsers/telegram.py
similarity index 100%
rename from backend/parsers/telegram.py
rename to backend/packages/files/parsers/telegram.py
diff --git a/backend/parsers/txt.py b/backend/packages/files/parsers/txt.py
similarity index 100%
rename from backend/parsers/txt.py
rename to backend/packages/files/parsers/txt.py
diff --git a/backend/parsers/xlsx.py b/backend/packages/files/parsers/xlsx.py
similarity index 100%
rename from backend/parsers/xlsx.py
rename to backend/packages/files/parsers/xlsx.py
diff --git a/backend/utils/processors.py b/backend/packages/files/processors.py
similarity index 81%
rename from backend/utils/processors.py
rename to backend/packages/files/processors.py
index 0b6855fc65fe..51b03ff268fe 100644
--- a/backend/utils/processors.py
+++ b/backend/packages/files/processors.py
@@ -1,21 +1,21 @@
 from models.brains import Brain
-from models.files import File
-from parsers.audio import process_audio
-from parsers.code_python import process_python
-from parsers.csv import process_csv
-from parsers.docx import process_docx
-from parsers.epub import process_epub
-from parsers.html import process_html
-from parsers.markdown import process_markdown
-from parsers.notebook import process_ipnyb
-from parsers.odt import process_odt
-from parsers.pdf import process_pdf
-from parsers.powerpoint import process_powerpoint
-from parsers.telegram import process_telegram
-from parsers.txt import process_txt
-from parsers.xlsx import process_xlsx
 from repository.brain.get_brain_by_id import get_brain_by_id
+from .parsers.audio import process_audio
+from .parsers.code_python import process_python
+from .parsers.csv import process_csv
+from .parsers.docx import process_docx
+from .parsers.epub import process_epub
+from .parsers.html import process_html
+from .parsers.markdown import process_markdown
+from .parsers.notebook import process_ipnyb
+from .parsers.odt import process_odt
+from .parsers.pdf import process_pdf
+from .parsers.powerpoint import process_powerpoint
+from .parsers.telegram import process_telegram
+from .parsers.txt import process_txt
+from .parsers.xlsx import process_xlsx
+
 
 file_processors = {
     ".txt": process_txt,
     ".csv": process_csv,
@@ -46,8 +46,9 @@ def create_response(message, type):
     return {"message": message, "type": type}
 
 
+# TODO: Move filter_file to a file service to avoid circular imports from models/files.py for File class
 async def filter_file(
-    file: File,
+    file,
     enable_summarization: bool,
     brain_id,
     openai_api_key,
diff --git a/backend/routes/crawl_routes.py b/backend/routes/crawl_routes.py
index 501e5a9e20f4..1f5b1f6e039f 100644
--- a/backend/routes/crawl_routes.py
+++ b/backend/routes/crawl_routes.py
@@ -10,9 +10,9 @@
 from models.databases.supabase.knowledge import CreateKnowledgeProperties
 from models.databases.supabase.notifications import CreateNotificationProperties
 from models.notifications import NotificationsStatusEnum
+from packages.files.file import convert_bytes
 from repository.knowledge.add_knowledge import add_knowledge
 from repository.notification.add_notification import add_notification
-from utils.file import convert_bytes
 
 logger = get_logger(__name__)
 crawl_router = APIRouter()
diff --git a/backend/routes/upload_routes.py b/backend/routes/upload_routes.py
index d309c39b752d..d31ff5c1f8bc 100644
--- a/backend/routes/upload_routes.py
+++ b/backend/routes/upload_routes.py
@@ -10,6 +10,7 @@
 from models.databases.supabase.knowledge import CreateKnowledgeProperties
 from models.databases.supabase.notifications import CreateNotificationProperties
 from models.notifications import NotificationsStatusEnum
+from packages.files.file import convert_bytes, get_file_size
 from repository.brain import get_brain_details
 from repository.files.upload_file import upload_file_storage
 from repository.knowledge.add_knowledge import add_knowledge
@@ -19,7 +20,6 @@
     RoleEnum,
     validate_brain_authorization,
 )
-from utils.file import convert_bytes, get_file_size
 
 logger = get_logger(__name__)
 upload_router = APIRouter()