From 8bdbf8e21feb304ad1a7c8ff27a5e3a39145ea6b Mon Sep 17 00:00:00 2001 From: Alisson Date: Thu, 18 Jan 2024 11:58:55 -0300 Subject: [PATCH 01/12] add txt and docx class loaders --- app/loaders/__init__.py | 7 ++++-- app/loaders/loaders.py | 41 +++++++++++++++++++++++++++++++ app/tests/test_document_loader.py | 14 +++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/app/loaders/__init__.py b/app/loaders/__init__.py index 220f5de..42d750b 100644 --- a/app/loaders/__init__.py +++ b/app/loaders/__init__.py @@ -1,6 +1,6 @@ import os -from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader) +from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader) from langchain.schema.document import Document from typing import List from app.text_splitters import ITextSplitter @@ -14,7 +14,10 @@ } supported_loaders_cls = { - 'pdf': PDFLoader + 'pdf': PDFLoader, + 'doc': DocxLoader, + 'docx': DocxLoader, + 'txt': TxtLoader, } diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py index 777c2a9..a40c887 100644 --- a/app/loaders/loaders.py +++ b/app/loaders/loaders.py @@ -52,6 +52,26 @@ def txt_loader(file: str) -> Callable: loader = TextLoader(file) return loader.load() +class TxtLoader(DocumentLoader): + def __init__(self, file:str) -> None: + self.loader = TextLoader(file) + + def load(self) -> List[Document]: + return self.loader.load_and_split() + + def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]: + pages = self.load() + split_pages = [] + for page in pages: + page_content = page.page_content.lower() + metadatas = page.metadata + metadatas.update({"full_page": page_content}) + + text_chunks = text_splitter.split_text(page_content) + for chunk in text_chunks: + split_pages.append(Document(page_content=chunk, metadata=metadatas)) + return split_pages + class 
PDFLoader(DocumentLoader): def __init__(self, file: str) -> None: @@ -92,6 +112,27 @@ def pdf_loader(file: str) -> Callable: return pages +class DocxLoader(DocumentLoader): + def __init__(self, file:str) -> None: + self.loader = UnstructuredWordDocumentLoader(file) + + def load(self) -> List[Document]: + return self.loader.load_and_split() + + def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]: + pages = self.load() + split_pages = [] + for page in pages: + page_content = page.page_content.lower() + metadatas = page.metadata + metadatas.update({"full_page": page_content}) + + text_chunks = text_splitter.split_text(page_content) + for chunk in text_chunks: + split_pages.append(Document(page_content=chunk, metadata=metadatas)) + return split_pages + + def docx_loader(file: str) -> Callable: loader = Docx2txtLoader(file) return loader.load() diff --git a/app/tests/test_document_loader.py b/app/tests/test_document_loader.py index 1e35151..bc6a96d 100644 --- a/app/tests/test_document_loader.py +++ b/app/tests/test_document_loader.py @@ -3,6 +3,8 @@ DataLoader, DataLoaderCls, PDFLoader, + DocxLoader, + TxtLoader, pdf_loader, txt_loader, docx_loader, @@ -126,6 +128,18 @@ def test_pdf_loader_cls(self): split_pages: List[Document] = pdf_loader.load_and_split_text(self.text_splitter) self.assertEqual(list, type(split_pages)) + def test_docx_loader_cls(self): + file_path = f'{self.path}/{self.file_name}.docx' + docx_loader = DocxLoader(file_path) + split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter) + self.assertEqual(list, type(split_pages)) + + def test_txt_loader_cls(self): + file_path = f'{self.path}/{self.file_name}.txt' + docx_loader = TxtLoader(file_path) + split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter) + self.assertEqual(list, type(split_pages)) + def test_load_file_url_and_split_text(self): file_path = f'{self.path}/{self.file_name}.pdf' file_type = "pdf" From 
288d17606ba4cbacc1689f8a989e045055b679ab Mon Sep 17 00:00:00 2001 From: Alisson Date: Fri, 19 Jan 2024 18:14:46 -0300 Subject: [PATCH 02/12] ajust txt loader to save file temp --- app/loaders/loaders.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py index a40c887..7ad578e 100644 --- a/app/loaders/loaders.py +++ b/app/loaders/loaders.py @@ -1,10 +1,14 @@ +import os +import uuid +import requests +from abc import ABC, abstractmethod + from langchain.document_loaders import ( TextLoader, PyPDFLoader, UnstructuredExcelLoader, UnstructuredWordDocumentLoader, Docx2txtLoader ) -from typing import Callable, List from langchain.schema.document import Document -from abc import ABC, abstractmethod +from typing import Callable, List from app.text_splitters import ITextSplitter @@ -53,8 +57,20 @@ def txt_loader(file: str) -> Callable: return loader.load() class TxtLoader(DocumentLoader): + def _get_file(self, file: str): + if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file: + response = requests.get(file) + if response.status_code == 200: + file_path = f"/tmp/{uuid.uuid4()}.txt" + text = response.text + with open(file_path, "w") as file: + file.write(text) + return file_path + return file + def __init__(self, file:str) -> None: - self.loader = TextLoader(file) + self.file = self._get_file(file) + self.loader = TextLoader(self.file) def load(self) -> List[Document]: return self.loader.load_and_split() From 2a24bc71192eeb6cabd35371f286f3a62502e35d Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 22 Jan 2024 16:09:35 -0300 Subject: [PATCH 03/12] send file type in the request --- app/celery.py | 6 +++++- app/handlers/nexus.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/app/celery.py b/app/celery.py index a8c8de2..5766982 100644 --- a/app/celery.py +++ b/app/celery.py @@ -29,6 +29,10 @@ def index_file_data(content_base: Dict) -> bool: text_splitter = 
TextSplitter(character_text_splitter()) manager = IndexerFileManager(file_downloader, main_app.content_base_indexer, text_splitter) index_result: bool = manager.index_file_url(content_base) - NexusRESTClient().index_succedded(task_succeded=index_result, nexus_task_uuid=content_base.get("task_uuid")) + NexusRESTClient().index_succedded( + task_succeded=index_result, + nexus_task_uuid=content_base.get("task_uuid"), + file_type=content_base.get("extension_file") + ) return index_result diff --git a/app/handlers/nexus.py b/app/handlers/nexus.py index 5c378aa..34ea2f7 100644 --- a/app/handlers/nexus.py +++ b/app/handlers/nexus.py @@ -14,11 +14,12 @@ def __init__(self) -> None: 'Content-Type': "application/json" } - def index_succedded(self, task_succeded: bool, nexus_task_uuid: str) -> None: + def index_succedded(self, task_succeded: bool, nexus_task_uuid: str, file_type: str) -> None: endpoint = f'{self.base_url}/api/v1/content-base-file' data = { "status": int(task_succeded), "task_uuid": nexus_task_uuid, + "file_type": file_type, } response = requests.patch(url=endpoint, data=json.dumps(data), headers=self.headers) response.raise_for_status() From 4b019c5b67591232d20dee5481dd8f25f7c61b5c Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 22 Jan 2024 16:38:26 -0300 Subject: [PATCH 04/12] send text of file in file_type --- app/handlers/nexus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/handlers/nexus.py b/app/handlers/nexus.py index 34ea2f7..1ea94d8 100644 --- a/app/handlers/nexus.py +++ b/app/handlers/nexus.py @@ -19,7 +19,7 @@ def index_succedded(self, task_succeded: bool, nexus_task_uuid: str, file_type: data = { "status": int(task_succeded), "task_uuid": nexus_task_uuid, - "file_type": file_type, + "file_type": "text" if file_type == "txt" else "file", } response = requests.patch(url=endpoint, data=json.dumps(data), headers=self.headers) response.raise_for_status() From b4953aef256e7b23157c15be9647198bfa926a74 Mon Sep 17 00:00:00 2001 
From: Alisson Date: Thu, 25 Jan 2024 11:15:03 -0300 Subject: [PATCH 05/12] fix INDEX_CONTENTBASES_NAME env var --- app/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/config.py b/app/config.py index 09b94e3..9f77593 100644 --- a/app/config.py +++ b/app/config.py @@ -39,5 +39,5 @@ def __init__(self): } self.content_base_index_name = os.environ.get( - "INDEX_PRODUCTS_NAME", "content_bases" + "INDEX_CONTENTBASES_NAME", "content_bases" ) From adc476b89e2d71cca3514ecf0101b87940c21049 Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 29 Jan 2024 16:01:11 -0300 Subject: [PATCH 06/12] add file_uuid to metadata --- app/handlers/content_bases.py | 1 + app/indexer/indexer_file_manager.py | 1 + 2 files changed, 2 insertions(+) diff --git a/app/handlers/content_bases.py b/app/handlers/content_bases.py index b60c111..a7cb999 100644 --- a/app/handlers/content_bases.py +++ b/app/handlers/content_bases.py @@ -13,6 +13,7 @@ class ContentBaseIndexRequest(BaseModel): file: str filename: str + file_uuid: str extension_file: str task_uuid: str content_base: str diff --git a/app/indexer/indexer_file_manager.py b/app/indexer/indexer_file_manager.py index b4e28b7..c5f093c 100644 --- a/app/indexer/indexer_file_manager.py +++ b/app/indexer/indexer_file_manager.py @@ -18,6 +18,7 @@ def add_file_metadata(document_pages: List[Document], content_base: Dict) -> Lis metadata = { "content_base_uuid": str(content_base.get('content_base')), "filename": content_base.get("filename"), + "file_uuid": content_base.get("file_uuid") } for page in document_pages: From 5f7cc9cbb38e368ee74e3e63330d78deb0cf79a7 Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 29 Jan 2024 16:16:28 -0300 Subject: [PATCH 07/12] update elasticsearch vectors, search by file_uuid --- app/indexer/content_bases.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/app/indexer/content_bases.py b/app/indexer/content_bases.py index 5310087..e5d6f1f 100644 --- 
a/app/indexer/content_bases.py +++ b/app/indexer/content_bases.py @@ -12,6 +12,18 @@ def __init__(self, storage: IStorage): self.storage = storage def index_documents(self, docs: List[Document]): + file_uuid = docs[0].metadata["file_uuid"] + content_base_uuid = docs[0].metadata["content_base_uuid"] + + results = self._search_docs_by_content_base_uuid( + content_base_uuid=content_base_uuid, + file_uuid=file_uuid, + ) + ids = [] + if len(results) > 0: + ids = [item["_id"] for item in results] + self.storage.delete(ids=ids) + return self.storage.save(docs) def index(self, texts: List, metadatas: dict): @@ -37,10 +49,12 @@ def search(self, search, filter=None, threshold=0.1) -> list[Product]: matched_responses = self.storage.search(search, filter, threshold) return set([doc.metadata.get("full_page") for doc in matched_responses]) - def _search_docs_by_content_base_uuid(self, content_base_uuid: UUID): + def _search_docs_by_content_base_uuid(self, content_base_uuid: UUID, file_uuid: str = None): search_filter = { "metadata.content_base_uuid": content_base_uuid } + if file_uuid: + search_filter.update({"metadata.file_uuid": file_uuid}) return self.storage.query_search(search_filter) def delete(self, content_base_uuid: UUID, filename: str): From 6d3ab6462115c6e12f8b725bc13e7074528c7bb7 Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 29 Jan 2024 17:54:54 -0300 Subject: [PATCH 08/12] delete file by uuid --- app/handlers/content_bases.py | 6 +++++- app/indexer/content_bases.py | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/app/handlers/content_bases.py b/app/handlers/content_bases.py index a7cb999..f9f021c 100644 --- a/app/handlers/content_bases.py +++ b/app/handlers/content_bases.py @@ -38,10 +38,13 @@ class ContentBaseSearchResponse(BaseModel): class ContentBaseDeleteRequest(BaseModel): filename: str content_base: str + file_uuid: str + class ContentBaseDeleteResponse(BaseModel): deleted: bool + class ContentBaseHandler(IDocumentHandler): def 
__init__(self, content_base_indexer: IDocumentIndexer): self.content_base_indexer = content_base_indexer @@ -74,7 +77,8 @@ def delete(self, request: ContentBaseDeleteRequest, Authorization: Annotated[str token_verification(Authorization) self.content_base_indexer.delete( request.content_base, - request.filename + request.filename, + request.file_uuid, ) return ContentBaseDeleteResponse(deleted=True) diff --git a/app/indexer/content_bases.py b/app/indexer/content_bases.py index e5d6f1f..c14f7bf 100644 --- a/app/indexer/content_bases.py +++ b/app/indexer/content_bases.py @@ -57,12 +57,15 @@ def _search_docs_by_content_base_uuid(self, content_base_uuid: UUID, file_uuid: search_filter.update({"metadata.file_uuid": file_uuid}) return self.storage.query_search(search_filter) - def delete(self, content_base_uuid: UUID, filename: str): + def delete(self, content_base_uuid: UUID, filename: str, file_uuid: str): search_filter = { "metadata.content_base_uuid": content_base_uuid, - "metadata.source": filename, + "metadata.file_uuid": file_uuid, } + if filename: + search_filter.update({"metadata.source": filename}) + scroll_id, results = self.storage.search_delete(search_filter) ids = [] From 257cdf571dfc93dacc45bb0b64a650d845a0f519 Mon Sep 17 00:00:00 2001 From: Alisson Date: Tue, 30 Jan 2024 14:21:13 -0300 Subject: [PATCH 09/12] index as environment variable --- app/store/elasticsearch_vector_store.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/app/store/elasticsearch_vector_store.py b/app/store/elasticsearch_vector_store.py index 9800143..95124f9 100644 --- a/app/store/elasticsearch_vector_store.py +++ b/app/store/elasticsearch_vector_store.py @@ -1,8 +1,9 @@ +import os + from langchain.vectorstores import VectorStore from langchain.docstore.document import Document from app.store import IStorage - from fastapi.logger import logger class ElasticsearchVectorStoreIndex(IStorage): def __init__(self, vectorstore: VectorStore, score=1.55): @@ -55,7 
+56,8 @@ def delete(self, ids: list[str] = []) -> bool: class ContentBaseElasticsearchVectorStoreIndex(ElasticsearchVectorStoreIndex): def save(self, docs: list[Document])-> list[str]: - res = self.vectorstore.from_documents(docs, self.vectorstore.embeddings, index_name="content_bases") + index = os.environ.get("INDEX_CONTENTBASES_NAME", "content_bases") + res = self.vectorstore.from_documents(docs, self.vectorstore.embeddings, index_name=index) return res def query_search(self, search_filter: dict) -> list[dict]: From f980c535239675813c9c9752a9b512c9ca744fbc Mon Sep 17 00:00:00 2001 From: Alisson Date: Tue, 30 Jan 2024 15:40:51 -0300 Subject: [PATCH 10/12] change docx loader --- app/loaders/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py index 7ad578e..0c6e530 100644 --- a/app/loaders/loaders.py +++ b/app/loaders/loaders.py @@ -130,7 +130,7 @@ def pdf_loader(file: str) -> Callable: class DocxLoader(DocumentLoader): def __init__(self, file:str) -> None: - self.loader = UnstructuredWordDocumentLoader(file) + self.loader = Docx2txtLoader(file) def load(self) -> List[Document]: return self.loader.load_and_split() From 66831572870075386c62eb266c480b5a5847cadb Mon Sep 17 00:00:00 2001 From: Alisson Date: Tue, 30 Jan 2024 18:51:32 -0300 Subject: [PATCH 11/12] add: xlsx and xls support --- app/loaders/__init__.py | 4 +++- app/loaders/loaders.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/app/loaders/__init__.py b/app/loaders/__init__.py index 42d750b..a119461 100644 --- a/app/loaders/__init__.py +++ b/app/loaders/__init__.py @@ -1,6 +1,6 @@ import os -from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader) +from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader, XlsxLoader) from 
langchain.schema.document import Document from typing import List from app.text_splitters import ITextSplitter @@ -18,6 +18,8 @@ 'doc': DocxLoader, 'docx': DocxLoader, 'txt': TxtLoader, + 'xlsx': XlsxLoader, + 'xls': XlsxLoader, } diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py index 0c6e530..10dc853 100644 --- a/app/loaders/loaders.py +++ b/app/loaders/loaders.py @@ -164,3 +164,23 @@ def xlsx_loader(file: str) -> Callable: """Loads .xlsx and .xls files""" loader = UnstructuredExcelLoader(file, mode="elements") return loader.load() + +class XlsxLoader(DocumentLoader): + def __init__(self, file:str) -> None: + self.loader = UnstructuredExcelLoader(file, mode="single") + + def load(self) -> List[Document]: + return self.loader.load_and_split() + + def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]: + pages = self.load() + split_pages = [] + for page in pages: + page_content = page.page_content.lower() + metadatas = page.metadata + metadatas.update({"full_page": page_content}) + + text_chunks = text_splitter.split_text(page_content) + for chunk in text_chunks: + split_pages.append(Document(page_content=chunk, metadata=metadatas)) + return split_pages From b575f23cda666713c64875b4a2c3e80c09ab3c6c Mon Sep 17 00:00:00 2001 From: Alisson Date: Tue, 30 Jan 2024 19:20:44 -0300 Subject: [PATCH 12/12] xlsx: save temp file --- app/loaders/loaders.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py index 10dc853..bffe77a 100644 --- a/app/loaders/loaders.py +++ b/app/loaders/loaders.py @@ -165,9 +165,19 @@ def xlsx_loader(file: str) -> Callable: loader = UnstructuredExcelLoader(file, mode="elements") return loader.load() +from urllib.request import urlretrieve +from urllib.parse import urlparse class XlsxLoader(DocumentLoader): def __init__(self, file:str) -> None: - self.loader = UnstructuredExcelLoader(file, mode="single") + tmp_file, _ = 
self._get_temp_file(file) + self.loader = UnstructuredExcelLoader(tmp_file, mode="single") + + def _get_temp_file(self, file_url: str): + result = urlparse(file_url) + filename = result.path.strip("/") + file_path, message = urlretrieve(file_url, f"/tmp/{filename}") + return file_path, message + + def load(self) -> List[Document]: + return self.loader.load_and_split()