Merge pull request #19 from weni-ai/feature/document-loader-cls
add txt and docx class loaders
AlisoSouza authored Jan 31, 2024
2 parents 56b2099 + 72c1a56 commit a21e2a1
Showing 10 changed files with 149 additions and 13 deletions.
6 changes: 5 additions & 1 deletion app/celery.py
@@ -29,6 +29,10 @@ def index_file_data(content_base: Dict) -> bool:
     text_splitter = TextSplitter(character_text_splitter())
     manager = IndexerFileManager(file_downloader, main_app.content_base_indexer, text_splitter)
     index_result: bool = manager.index_file_url(content_base)
-    NexusRESTClient().index_succedded(task_succeded=index_result, nexus_task_uuid=content_base.get("task_uuid"))
+    NexusRESTClient().index_succedded(
+        task_succeded=index_result,
+        nexus_task_uuid=content_base.get("task_uuid"),
+        file_type=content_base.get("extension_file")
+    )
 
     return index_result
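
For context, the content_base payload consumed above now carries the file's extension and UUID alongside the existing fields. A hypothetical example, with field names taken from ContentBaseIndexRequest below and all values made up:

# Illustrative payload handed to index_file_data; every value here is hypothetical.
content_base = {
    "file": "https://bucket.s3.amazonaws.com/user-upload.txt",  # made-up URL
    "filename": "user-upload.txt",
    "file_uuid": "8f3c9a4e-0000-0000-0000-000000000000",  # identifies this file's chunks
    "extension_file": "txt",  # forwarded to Nexus as file_type
    "task_uuid": "c0ffee00-0000-0000-0000-000000000000",
    "content_base": "content-base-uuid",
}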
2 changes: 1 addition & 1 deletion app/config.py
@@ -39,5 +39,5 @@ def __init__(self):
         }
 
         self.content_base_index_name = os.environ.get(
-            "INDEX_PRODUCTS_NAME", "content_bases"
+            "INDEX_CONTENTBASES_NAME", "content_bases"
         )
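
Note the rename from INDEX_PRODUCTS_NAME to INDEX_CONTENTBASES_NAME: deployments still setting the old variable will silently fall back to the "content_bases" default. A minimal sketch of the new lookup:

import os

# Only INDEX_CONTENTBASES_NAME is consulted after this change; the old
# INDEX_PRODUCTS_NAME variable is ignored and the default applies instead.
index_name = os.environ.get("INDEX_CONTENTBASES_NAME", "content_bases")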
7 changes: 6 additions & 1 deletion app/handlers/content_bases.py
@@ -13,6 +13,7 @@
 class ContentBaseIndexRequest(BaseModel):
     file: str
     filename: str
+    file_uuid: str
     extension_file: str
     task_uuid: str
     content_base: str
@@ -37,10 +38,13 @@ class ContentBaseSearchResponse(BaseModel):
 class ContentBaseDeleteRequest(BaseModel):
     filename: str
     content_base: str
+    file_uuid: str
 
 
 class ContentBaseDeleteResponse(BaseModel):
     deleted: bool
 
 
 class ContentBaseHandler(IDocumentHandler):
     def __init__(self, content_base_indexer: IDocumentIndexer):
         self.content_base_indexer = content_base_indexer
@@ -73,7 +77,8 @@ def delete(self, request: ContentBaseDeleteRequest, Authorization: Annotated[str
         token_verification(Authorization)
         self.content_base_indexer.delete(
             request.content_base,
-            request.filename
+            request.filename,
+            request.file_uuid,
         )
         return ContentBaseDeleteResponse(deleted=True)
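
Since ContentBaseDeleteRequest now requires file_uuid, a delete call's body looks roughly like this (all values are illustrative):

# Hypothetical request body for the delete endpoint, matching ContentBaseDeleteRequest.
delete_request = {
    "filename": "user-upload.txt",
    "content_base": "content-base-uuid",
    "file_uuid": "8f3c9a4e-0000-0000-0000-000000000000",  # made-up UUID
}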

3 changes: 2 additions & 1 deletion app/handlers/nexus.py
@@ -14,11 +14,12 @@ def __init__(self) -> None:
             'Content-Type': "application/json"
         }
 
-    def index_succedded(self, task_succeded: bool, nexus_task_uuid: str) -> None:
+    def index_succedded(self, task_succeded: bool, nexus_task_uuid: str, file_type: str) -> None:
         endpoint = f'{self.base_url}/api/v1/content-base-file'
         data = {
             "status": int(task_succeded),
             "task_uuid": nexus_task_uuid,
+            "file_type": "text" if file_type == "txt" else "file",
         }
         response = requests.patch(url=endpoint, data=json.dumps(data), headers=self.headers)
         response.raise_for_status()
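
The PATCH body Nexus receives therefore gains a file_type field that maps the "txt" extension to "text" and every other extension to "file". A sketch with made-up values:

# Illustrative payload for the PATCH to /api/v1/content-base-file.
# status is 1 on success and 0 on failure; the task_uuid is hypothetical.
data = {
    "status": 1,
    "task_uuid": "c0ffee00-0000-0000-0000-000000000000",
    "file_type": "text",  # "text" only when the extension is "txt", otherwise "file"
}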
23 changes: 20 additions & 3 deletions app/indexer/content_bases.py
@@ -12,6 +12,18 @@ def __init__(self, storage: IStorage):
         self.storage = storage
 
     def index_documents(self, docs: List[Document]):
+        file_uuid = docs[0].metadata["file_uuid"]
+        content_base_uuid = docs[0].metadata["content_base_uuid"]
+
+        results = self._search_docs_by_content_base_uuid(
+            content_base_uuid=content_base_uuid,
+            file_uuid=file_uuid,
+        )
+        ids = []
+        if len(results) > 0:
+            ids = [item["_id"] for item in results]
+            self.storage.delete(ids=ids)
+
         return self.storage.save(docs)
 
     def index(self, texts: List, metadatas: dict):
@@ -37,18 +49,23 @@ def search(self, search, filter=None, threshold=0.1) -> list[Product]:
         matched_responses = self.storage.search(search, filter, threshold)
         return set([doc.metadata.get("full_page") for doc in matched_responses])
 
-    def _search_docs_by_content_base_uuid(self, content_base_uuid: UUID):
+    def _search_docs_by_content_base_uuid(self, content_base_uuid: UUID, file_uuid: str = None):
         search_filter = {
             "metadata.content_base_uuid": content_base_uuid
         }
+        if file_uuid:
+            search_filter.update({"metadata.file_uuid": file_uuid})
         return self.storage.query_search(search_filter)
 
-    def delete(self, content_base_uuid: UUID, filename: str):
+    def delete(self, content_base_uuid: UUID, filename: str, file_uuid: str):
         search_filter = {
             "metadata.content_base_uuid": content_base_uuid,
-            "metadata.source": filename,
+            "metadata.file_uuid": file_uuid,
         }
 
+        if filename:
+            search_filter.update({"metadata.source": filename})
+
         scroll_id, results = self.storage.search_delete(search_filter)
         ids = []
1 change: 1 addition & 0 deletions app/indexer/indexer_file_manager.py
@@ -18,6 +18,7 @@ def add_file_metadata(document_pages: List[Document], content_base: Dict) -> Lis
     metadata = {
         "content_base_uuid": str(content_base.get('content_base')),
         "filename": content_base.get("filename"),
+        "file_uuid": content_base.get("file_uuid")
 
     }
     for page in document_pages:
9 changes: 7 additions & 2 deletions app/loaders/__init__.py
@@ -1,6 +1,6 @@
 
 import os
-from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader)
+from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader, XlsxLoader)
 from langchain.schema.document import Document
 from typing import List
 from app.text_splitters import ITextSplitter
@@ -14,7 +14,12 @@
 }
 
 supported_loaders_cls = {
-    'pdf': PDFLoader
+    'pdf': PDFLoader,
+    'doc': DocxLoader,
+    'docx': DocxLoader,
+    'txt': TxtLoader,
+    'xlsx': XlsxLoader,
+    'xls': XlsxLoader,
 }
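
Class loaders are now looked up by file extension. A minimal dispatch sketch, assuming (as the classes in app/loaders/loaders.py below show) that each loader takes the file path or URL in its constructor:

# Hypothetical dispatch: pick a loader class by extension and split the file.
# "splitter" stands in for the ITextSplitter instance used elsewhere in the app.
extension = "docx"
loader_cls = supported_loaders_cls[extension]   # e.g. DocxLoader
loader = loader_cls("/tmp/example.docx")        # made-up path
pages = loader.load_and_split_text(splitter)    # returns List[Document]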


91 changes: 89 additions & 2 deletions app/loaders/loaders.py
@@ -1,10 +1,14 @@
 import os
+import uuid
+import requests
+from abc import ABC, abstractmethod
 
 from langchain.document_loaders import (
     TextLoader, PyPDFLoader, UnstructuredExcelLoader,
     UnstructuredWordDocumentLoader, Docx2txtLoader
 )
 from typing import Callable, List
 from langchain.schema.document import Document
 from abc import ABC, abstractmethod
+from typing import Callable, List
 from app.text_splitters import ITextSplitter


@@ -52,6 +56,38 @@ def txt_loader(file: str) -> Callable:
     loader = TextLoader(file)
     return loader.load()
 
+class TxtLoader(DocumentLoader):
+    def _get_file(self, file: str):
+        if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file:
+            response = requests.get(file)
+            if response.status_code == 200:
+                file_path = f"/tmp/{uuid.uuid4()}.txt"
+                text = response.text
+                with open(file_path, "w") as file:
+                    file.write(text)
+                return file_path
+        return file
+
+    def __init__(self, file:str) -> None:
+        self.file = self._get_file(file)
+        self.loader = TextLoader(self.file)
+
+    def load(self) -> List[Document]:
+        return self.loader.load_and_split()
+
+    def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
+        pages = self.load()
+        split_pages = []
+        for page in pages:
+            page_content = page.page_content.lower()
+            metadatas = page.metadata
+            metadatas.update({"full_page": page_content})
+
+            text_chunks = text_splitter.split_text(page_content)
+            for chunk in text_chunks:
+                split_pages.append(Document(page_content=chunk, metadata=metadatas))
+        return split_pages
+

 class PDFLoader(DocumentLoader):
     def __init__(self, file: str) -> None:
@@ -92,6 +128,27 @@ def pdf_loader(file: str) -> Callable:
     return pages
 
 
+class DocxLoader(DocumentLoader):
+    def __init__(self, file:str) -> None:
+        self.loader = Docx2txtLoader(file)
+
+    def load(self) -> List[Document]:
+        return self.loader.load_and_split()
+
+    def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
+        pages = self.load()
+        split_pages = []
+        for page in pages:
+            page_content = page.page_content.lower()
+            metadatas = page.metadata
+            metadatas.update({"full_page": page_content})
+
+            text_chunks = text_splitter.split_text(page_content)
+            for chunk in text_chunks:
+                split_pages.append(Document(page_content=chunk, metadata=metadatas))
+        return split_pages
+
+
 def docx_loader(file: str) -> Callable:
     loader = Docx2txtLoader(file)
     return loader.load()
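
TxtLoader and DocxLoader share the same lowercase-and-split behavior, so usage is identical across them; note that TxtLoader downloads to /tmp only when the URL contains the configured AWS_STORAGE_BUCKET_NAME, and treats any other string as a local path. A sketch (the splitter import path is an assumption based on the usage in app/celery.py above):

# Hypothetical usage of the new class loaders.
from app.loaders.loaders import DocxLoader, TxtLoader
from app.text_splitters import TextSplitter, character_text_splitter  # assumed import path

splitter = TextSplitter(character_text_splitter())
txt_chunks = TxtLoader("/tmp/example.txt").load_and_split_text(splitter)     # made-up path
docx_chunks = DocxLoader("/tmp/example.docx").load_and_split_text(splitter)  # made-up path
# Each chunk is a Document whose metadata carries the lowercased "full_page" text.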
@@ -107,3 +164,33 @@ def xlsx_loader(file: str) -> Callable:
     """Loads .xlsx and .xls files"""
     loader = UnstructuredExcelLoader(file, mode="elements")
     return loader.load()
+
+from urllib.request import urlretrieve
+from urllib.parse import urlparse
+class XlsxLoader(DocumentLoader):
+    def __init__(self, file:str) -> None:
+        tmp_file, _ = self._get_temp_file(file)
+        self.loader = UnstructuredExcelLoader(tmp_file, mode="single")
+
+    def _get_temp_file(self, file_url: str):
+        result = urlparse(file_url)
+        filename = result.path.strip("/")
+        file_path, message = urlretrieve(file_url, f"/tmp/{filename}")
+        return file_path, message
+
+
+    def load(self) -> List[Document]:
+        return self.loader.load_and_split()
+
+    def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
+        pages = self.load()
+        split_pages = []
+        for page in pages:
+            page_content = page.page_content.lower()
+            metadatas = page.metadata
+            metadatas.update({"full_page": page_content})
+
+            text_chunks = text_splitter.split_text(page_content)
+            for chunk in text_chunks:
+                split_pages.append(Document(page_content=chunk, metadata=metadatas))
+        return split_pages
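
XlsxLoader, by contrast, always fetches its input with urlretrieve, so it assumes file is a URL; a plain local path may fail to resolve. A hypothetical call:

# XlsxLoader downloads the sheet to /tmp (filename taken from the URL path),
# then loads it in "single" mode: one Document per sheet rather than per element.
loader = XlsxLoader("https://bucket.s3.amazonaws.com/report.xlsx")  # made-up URL
xlsx_chunks = loader.load_and_split_text(splitter)  # splitter as in the previous sketch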
6 changes: 4 additions & 2 deletions app/store/elasticsearch_vector_store.py
@@ -1,8 +1,9 @@
+import os
 
 from langchain.vectorstores import VectorStore
 from langchain.docstore.document import Document
 
 from app.store import IStorage
 
+from fastapi.logger import logger
 class ElasticsearchVectorStoreIndex(IStorage):
     def __init__(self, vectorstore: VectorStore, score=1.55):
@@ -55,7 +56,8 @@ def delete(self, ids: list[str] = []) -> bool:
 class ContentBaseElasticsearchVectorStoreIndex(ElasticsearchVectorStoreIndex):
 
     def save(self, docs: list[Document])-> list[str]:
-        res = self.vectorstore.from_documents(docs, self.vectorstore.embeddings, index_name="content_bases")
+        index = os.environ.get("INDEX_CONTENTBASES_NAME", "content_bases")
+        res = self.vectorstore.from_documents(docs, self.vectorstore.embeddings, index_name=index)
         return res
 
     def query_search(self, search_filter: dict) -> list[dict]:
14 changes: 14 additions & 0 deletions app/tests/test_document_loader.py
@@ -3,6 +3,8 @@
     DataLoader,
     DataLoaderCls,
     PDFLoader,
+    DocxLoader,
+    TxtLoader,
     pdf_loader,
     txt_loader,
     docx_loader,
@@ -126,6 +128,18 @@ def test_pdf_loader_cls(self):
         split_pages: List[Document] = pdf_loader.load_and_split_text(self.text_splitter)
         self.assertEqual(list, type(split_pages))
 
+    def test_docx_loader_cls(self):
+        file_path = f'{self.path}/{self.file_name}.docx'
+        docx_loader = DocxLoader(file_path)
+        split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter)
+        self.assertEqual(list, type(split_pages))
+
+    def test_txt_loader_cls(self):
+        file_path = f'{self.path}/{self.file_name}.txt'
+        txt_loader = TxtLoader(file_path)
+        split_pages: List[Document] = txt_loader.load_and_split_text(self.text_splitter)
+        self.assertEqual(list, type(split_pages))
+
     def test_load_file_url_and_split_text(self):
         file_path = f'{self.path}/{self.file_name}.pdf'
         file_type = "pdf"
