Skip to content

Commit

Permalink
Merge pull request #26 from weni-ai/feature/links
Browse files Browse the repository at this point in the history
load and index urls
  • Loading branch information
AlisoSouza authored Mar 28, 2024
2 parents 046774a + 9cdf2fb commit 7b4cc88
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 13 deletions.
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ COPY pyproject.toml poetry.lock ./
RUN poetry config virtualenvs.create false && \
poetry install --no-dev

# System library required by the python-magic package installed below.
# apt-get is used instead of apt (apt has no stable CLI for scripts), and the
# package lists are removed in the same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libmagic1 && \
    rm -rf /var/lib/apt/lists/*
# NOTE(review): `poetry add` at build time is not pinned by the committed
# poetry.lock; consider declaring python-magic in pyproject.toml instead.
RUN poetry add python-magic
# Download the NLTK data packages into a shared system location.
RUN python -m nltk.downloader punkt averaged_perceptron_tagger -d /usr/share/nltk_data

COPY . .

EXPOSE 8000
Expand Down
10 changes: 9 additions & 1 deletion app/handlers/nexus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,18 @@ def __init__(self) -> None:

def index_succedded(self, task_succeded: bool, nexus_task_uuid: str, file_type: str) -> None:
    """Report the outcome of an indexing task to the content-base-file endpoint.

    Args:
        task_succeded: Whether the indexing task succeeded; sent as ``1`` or
            ``0`` in the ``status`` field.
        nexus_task_uuid: Identifier sent in the ``task_uuid`` field.
        file_type: Loader type of the indexed content. ``"txt"`` is reported
            as ``"text"``, ``"urls"`` as ``"link"`` and anything else as
            ``"file"``.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
    """
    endpoint = f'{self.base_url}/api/v1/content-base-file'

    # Map loader types to the names sent to the endpoint; unknown types
    # fall back to the generic "file".
    ftype = {"txt": "text", "urls": "link"}.get(file_type, "file")

    data = {
        "status": int(task_succeded),
        "task_uuid": nexus_task_uuid,
        "file_type": ftype,
    }
    response = requests.patch(url=endpoint, data=json.dumps(data), headers=self.headers)
    response.raise_for_status()
20 changes: 17 additions & 3 deletions app/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@

import os
from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader, XlsxLoader)
from app.loaders.loaders import (
DataLoader,
txt_loader,
pdf_loader,
docx_loader,
xlsx_loader,
)
from app.loaders.loaders import (
DataLoaderCls,
PDFLoader,
DocxLoader,
TxtLoader,
XlsxLoader,
URLsLoader,
)
from langchain.schema.document import Document
from typing import List
from app.text_splitters import ITextSplitter

supported_loaders = {
'txt': txt_loader,
'pdf': pdf_loader,
Expand All @@ -20,9 +35,9 @@
'txt': TxtLoader,
'xlsx': XlsxLoader,
'xls': XlsxLoader,
'urls': URLsLoader,
}


def load_file_and_get_raw_text(file_name: str, file_type: str) -> str:
file_path = f'{os.environ.get("FILE_PATH")}/{file_name}'
loader = supported_loaders.get(file_type)
Expand All @@ -47,6 +62,5 @@ def load_file_url_and_split_text(file_url: str, file_type: str, text_splitter: I
load_type = kwargs.get("load_type", None)

loader = supported_loaders_cls.get(file_type)

data_loader = DataLoaderCls(loader=loader, file=file_url, load_type=load_type)
return data_loader.load_and_split_text(text_splitter)
37 changes: 32 additions & 5 deletions app/loaders/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@

from langchain.document_loaders import (
TextLoader, PyPDFLoader, UnstructuredExcelLoader,
UnstructuredWordDocumentLoader, Docx2txtLoader,
PDFMinerLoader
UnstructuredWordDocumentLoader, Docx2txtLoader, UnstructuredURLLoader, PDFMinerLoader
)
from langchain.schema.document import Document
from typing import Callable, List
from typing import Callable, List, Union
from app.text_splitters import ITextSplitter


Expand Down Expand Up @@ -65,7 +64,7 @@ def txt_loader(file: str) -> Callable:

class TxtLoader(DocumentLoader):
def _get_file(self, file: str):
if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file:
if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file: # pragma: no cover
response = requests.get(file)
if response.status_code == 200:
file_path = f"/tmp/{uuid.uuid4()}.txt"
Expand Down Expand Up @@ -189,7 +188,7 @@ def __init__(self, file:str) -> None:
tmp_file, _ = self._get_temp_file(file)
self.loader = UnstructuredExcelLoader(tmp_file, mode="single")

def _get_temp_file(self, file_url: str):
def _get_temp_file(self, file_url: str): # pragma: no cover
result = urlparse(file_url)
filename = result.path.strip("/")
file_path, message = urlretrieve(file_url, f"/tmp/{filename}")
Expand All @@ -211,3 +210,31 @@ def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
for chunk in text_chunks:
split_pages.append(Document(page_content=chunk, metadata=metadatas))
return split_pages


class URLsLoader(DocumentLoader):
    """Load one or more URLs through ``UnstructuredURLLoader`` and split them."""

    def _urls(self, urls: Union[List[str], str]) -> List[str]:
        """Return ``urls`` as a list, wrapping a single URL string."""
        if isinstance(urls, str):
            return [urls]
        return urls

    def __init__(self, urls: Union[List[str], str]) -> None:
        """Build the underlying loader.

        Args:
            urls: A single URL or a list of URLs to load.
        """
        self.urls = self._urls(urls)
        self.loader = UnstructuredURLLoader(urls=self.urls)

    def load(self) -> List[Document]:
        """Return the documents produced by the underlying loader."""
        return self.loader.load()

    def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
        """Load the URLs and split each loaded page into chunk documents.

        Page text is lower-cased before splitting. Every chunk carries the
        page's metadata plus a ``full_page`` entry holding the lower-cased
        text of the page it came from.

        Args:
            text_splitter: Splitter whose ``split_text`` produces the chunks.

        Returns:
            One ``Document`` per chunk, in page order.
        """
        split_pages = []

        pages = self.loader.load_and_split()
        for page in pages:
            page_content = page.page_content.lower()
            # Build a fresh dict instead of mutating page.metadata in place.
            metadatas = {**page.metadata, "full_page": page_content}

            for chunk in text_splitter.split_text(page_content):
                # Each chunk gets its own metadata copy so that editing one
                # document's metadata later cannot leak into its siblings.
                split_pages.append(Document(page_content=chunk, metadata=dict(metadatas)))
        return split_pages
27 changes: 27 additions & 0 deletions app/tests/test_document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
PDFLoader,
DocxLoader,
TxtLoader,
URLsLoader,
XlsxLoader,
pdf_loader,
txt_loader,
docx_loader,
Expand Down Expand Up @@ -122,18 +124,43 @@ def test_load_xlsx(self):
raw_text = data_loader.raw_text()
self.assertEqual(type(raw_text), str)

@mock.patch("app.loaders.loaders.XlsxLoader._get_temp_file")
def test_load_xlsx_cls(self, mock_file_url):
    """XlsxLoader splits the .xlsx fixture into a list of documents."""
    fixture_path = f'{self.path}/{self.file_name}.xlsx'
    # _get_temp_file is patched to hand back the fixture path itself.
    mock_file_url.return_value = (fixture_path, "")
    loader = XlsxLoader(fixture_path)
    documents: List[Document] = loader.load_and_split_text(self.text_splitter)
    self.assertEqual(list, type(documents))

def test_pdf_loader_cls(self):
    """PDFLoader splits the .pdf fixture into a list of documents."""
    fixture_path = f'{self.path}/{self.file_name}.pdf'
    loader = PDFLoader(fixture_path)
    documents: List[Document] = loader.load_and_split_text(self.text_splitter)
    self.assertEqual(list, type(documents))

def test_urls_loader_cls(self):
    """URLsLoader.load returns a list when given a single URL string."""
    # NOTE(review): nothing is mocked here, so this presumably fetches the
    # live page and needs network access - confirm.
    loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
    documents: List[Document] = loader.load()
    self.assertEqual(list, type(documents))

def test_urls_loader_and_split_cls(self):
    """URLsLoader.load_and_split_text returns a list for a single URL string."""
    # NOTE(review): nothing is mocked here, so this presumably fetches the
    # live page and needs network access - confirm.
    loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
    documents: List[Document] = loader.load_and_split_text(self.text_splitter)
    self.assertEqual(list, type(documents))

def test_urls_list_loader_and_split_cls(self):
    """URLsLoader.load_and_split_text returns a list for a list of URLs."""
    # NOTE(review): nothing is mocked here, so this presumably fetches the
    # live page and needs network access - confirm.
    url_list = ["https://en.wikipedia.org/wiki/Unit_testing"]
    loader = URLsLoader(url_list)
    documents: List[Document] = loader.load_and_split_text(self.text_splitter)
    self.assertEqual(list, type(documents))

def test_docx_loader_cls(self):
    """DocxLoader splits the .docx fixture into a list of documents."""
    fixture_path = f'{self.path}/{self.file_name}.docx'
    loader = DocxLoader(fixture_path)
    documents: List[Document] = loader.load_and_split_text(self.text_splitter)
    self.assertEqual(list, type(documents))

@mock.patch.dict(os.environ, {"AWS_STORAGE_BUCKET_NAME": "file-path"})
def test_txt_loader_cls(self):
file_path = f'{self.path}/{self.file_name}.txt'
docx_loader = TxtLoader(file_path)
Expand Down
4 changes: 2 additions & 2 deletions app/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ class ContentHandler(EmbeddingsContentHandler):
content_type = "application/json"
accepts = "application/json"

def transform_input(self, inputs: list[str], model_kwargs: dict) -> bytes:  # pragma: no cover
    """Serialise the texts and model options into a UTF-8 JSON request body.

    Args:
        inputs: Texts placed under the ``"inputs"`` key.
        model_kwargs: Extra options merged into the top level of the payload;
            an ``"inputs"`` entry here would override ``inputs``.

    Returns:
        The JSON document encoded as UTF-8 bytes.
    """
    input_str = json.dumps({"inputs": inputs, **model_kwargs})
    return input_str.encode("utf-8")

def transform_output(self, output: bytes) -> list[list[float]]:  # pragma: no cover
    """Extract the embedding vectors from a JSON response.

    Args:
        output: Response body. NOTE(review): annotated as ``bytes`` but
            ``.read()`` is called on it, so a file-like object yielding
            bytes is what actually works here - confirm against the caller.

    Returns:
        The value stored under the ``"vectors"`` key of the decoded JSON.

    Raises:
        KeyError: If the response has no ``"vectors"`` key.
    """
    response_json = json.loads(output.read().decode("utf-8"))
    return response_json["vectors"]

Expand Down
21 changes: 19 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pydantic = "2.3.0"
celery = "^5.3.6"
redis = "^5.0.1"
sentry-sdk = {extras = ["fastapi"], version = "^1.35.0"}
xlrd = "^2.0.1"
pdfminer-six = "^20231228"


Expand Down

0 comments on commit 7b4cc88

Please sign in to comment.