From 616412e9d3ce3e736a84e8863104e554d2cc2daf Mon Sep 17 00:00:00 2001
From: Alisson
Date: Mon, 4 Mar 2024 12:39:07 -0300
Subject: [PATCH 1/7] load and index urls

---
 Dockerfile                        |  3 +++
 app/handlers/nexus.py             | 10 +++++++++-
 app/loaders/__init__.py           | 21 ++++++++++++++++++---
 app/loaders/loaders.py            | 26 ++++++++++++++++++++++----
 app/tests/test_document_loader.py | 26 ++++++++++++++++++++++++++
 app/util.py                       |  4 ++--
 poetry.lock                       | 13 ++++++++-----
 7 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ddb6f86..6e3ea55 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,6 +9,9 @@ COPY pyproject.toml poetry.lock ./
 RUN poetry config virtualenvs.create false && \
     poetry install --no-dev
 
+RUN apt update && apt install libmagic1 -y
+RUN poetry add python-magic
+
 COPY . .
 
 EXPOSE 8000

diff --git a/app/handlers/nexus.py b/app/handlers/nexus.py
index 1ea94d8..e598795 100644
--- a/app/handlers/nexus.py
+++ b/app/handlers/nexus.py
@@ -16,10 +16,18 @@ def __init__(self) -> None:
 
     def index_succedded(self, task_succeded: bool, nexus_task_uuid: str, file_type: str) -> None:
         endpoint = f'{self.base_url}/api/v1/content-base-file'
+
+        if file_type == "txt":
+            ftype = "text"
+        elif file_type == "urls":
+            ftype = "link"
+        else:
+            ftype = "file"
+
         data = {
             "status": int(task_succeded),
             "task_uuid": nexus_task_uuid,
-            "file_type": "text" if file_type == "txt" else "file",
+            "file_type": ftype,
         }
         response = requests.patch(url=endpoint, data=json.dumps(data), headers=self.headers)
         response.raise_for_status()

diff --git a/app/loaders/__init__.py b/app/loaders/__init__.py
index a119461..4c32adb 100644
--- a/app/loaders/__init__.py
+++ b/app/loaders/__init__.py
@@ -1,9 +1,24 @@
 import os
-from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader, XlsxLoader)
+from app.loaders.loaders import (
+    DataLoader,
+    txt_loader,
+    pdf_loader,
+    docx_loader,
+    xlsx_loader,
+)
+from app.loaders.loaders import (
+    DataLoaderCls,
+    PDFLoader,
+    DocxLoader,
+    TxtLoader,
+    XlsxLoader,
+    URLsLoader,
+)
 from langchain.schema.document import Document
 from typing import List
 from app.text_splitters import ITextSplitter
 
+
 supported_loaders = {
     'txt': txt_loader,
     'pdf': pdf_loader,
@@ -20,9 +35,9 @@
     'txt': TxtLoader,
     'xlsx': XlsxLoader,
     'xls': XlsxLoader,
+    'urls': URLsLoader,
 }
 
-
 def load_file_and_get_raw_text(file_name: str, file_type: str) -> str:
     file_path = f'{os.environ.get("FILE_PATH")}/{file_name}'
     loader = supported_loaders.get(file_type)
@@ -45,4 +60,4 @@ def load_file_url_and_get_pages_text(file_url: str, file_type: str) -> List[Docu
 def load_file_url_and_split_text(file_url: str, file_type: str, text_splitter: ITextSplitter) -> List[Document]:
     loader = supported_loaders_cls.get(file_type)
     data_loader = DataLoaderCls(loader, file_url)
-    return data_loader.load_and_split_text(text_splitter)
\ No newline at end of file
+    return data_loader.load_and_split_text(text_splitter)

diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py
index bffe77a..4fb1cc2 100644
--- a/app/loaders/loaders.py
+++ b/app/loaders/loaders.py
@@ -5,10 +5,10 @@
 from langchain.document_loaders import (
     TextLoader, PyPDFLoader,
     UnstructuredExcelLoader,
-    UnstructuredWordDocumentLoader, Docx2txtLoader
+    UnstructuredWordDocumentLoader, Docx2txtLoader, UnstructuredURLLoader
 )
 from langchain.schema.document import Document
-from typing import Callable, List
+from typing import Callable, List, Union
 from app.text_splitters import ITextSplitter
@@ -58,7 +58,7 @@ def txt_loader(file: str) -> Callable:
 
 class TxtLoader(DocumentLoader):
     def _get_file(self, file: str):
-        if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file:
+        if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file:  # pragma: no cover
             response = requests.get(file)
             if response.status_code == 200:
                 file_path = f"/tmp/{uuid.uuid4()}.txt"
@@ -172,7 +172,7 @@ def __init__(self, file:str) -> None:
         tmp_file, _ = self._get_temp_file(file)
         self.loader = UnstructuredExcelLoader(tmp_file, mode="single")
 
-    def _get_temp_file(self, file_url: str):
+    def _get_temp_file(self, file_url: str):  # pragma: no cover
         result = urlparse(file_url)
         filename = result.path.strip("/")
         file_path, message = urlretrieve(file_url, f"/tmp/{filename}")
@@ -194,3 +194,21 @@ def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
         for chunk in text_chunks:
             split_pages.append(Document(page_content=chunk, metadata=metadatas))
         return split_pages
+
+
+class URLsLoader(DocumentLoader):
+    def _urls(self, urls: Union[List[str], str]):
+        if isinstance(urls, str):
+            return [urls]
+        return urls
+
+    def __init__(self, urls: Union[List[str], str]) -> None:
+        self.urls = self._urls(urls)
+        self.loader = UnstructuredURLLoader(urls=self.urls)
+
+    def load(self) -> List[Document]:
+        return self.loader.load()
+
+    def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
+        docs = self.loader.load_and_split(text_splitter)
+        return docs

diff --git a/app/tests/test_document_loader.py b/app/tests/test_document_loader.py
index bc6a96d..262e325 100644
--- a/app/tests/test_document_loader.py
+++ b/app/tests/test_document_loader.py
@@ -5,6 +5,8 @@
     PDFLoader,
     DocxLoader,
     TxtLoader,
+    URLsLoader,
+    XlsxLoader,
     pdf_loader,
     txt_loader,
     docx_loader,
@@ -122,11 +124,35 @@ def test_load_xlsx(self):
         raw_text = data_loader.raw_text()
         self.assertEqual(type(raw_text), str)
 
+    @mock.patch("app.loaders.loaders.XlsxLoader._get_temp_file")
+    def test_load_xlsx_cls(self, mock_file_url):
+        file_path = f'{self.path}/{self.file_name}.xlsx'
+        mock_file_url.return_value = (file_path, "")
+        xlsx_loader = XlsxLoader(file_path)
+        split_pages: List[Document] = xlsx_loader.load_and_split_text(self.text_splitter)
+        self.assertEqual(list, type(split_pages))
+
     def test_pdf_loader_cls(self):
         file_path = f'{self.path}/{self.file_name}.pdf'
         pdf_loader = PDFLoader(file_path)
         split_pages: List[Document] = pdf_loader.load_and_split_text(self.text_splitter)
         self.assertEqual(list, type(split_pages))
+
+    def test_urls_loader_cls(self):
+        urls_loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
+        split_pages: List[Document] = urls_loader.load()
+        self.assertEqual(list, type(split_pages))
+
+    def test_urls_loader_and_split_cls(self):
+        urls_loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
+        split_pages: List[Document] = urls_loader.load_and_split_text(self.text_splitter)
+        self.assertEqual(list, type(split_pages))
+
+    def test_urls_list_loader_and_split_cls(self):
+        urls = ["https://en.wikipedia.org/wiki/Unit_testing"]
+        urls_loader = URLsLoader(urls)
+        split_pages: List[Document] = urls_loader.load_and_split_text(self.text_splitter)
+        self.assertEqual(list, type(split_pages))
 
     def test_docx_loader_cls(self):
         file_path = f'{self.path}/{self.file_name}.docx'

diff --git a/app/util.py b/app/util.py
index a4c0aa7..3f148f0 100644
--- a/app/util.py
+++ b/app/util.py
@@ -7,11 +7,11 @@ class ContentHandler(EmbeddingsContentHandler):
     content_type = "application/json"
     accepts = "application/json"
 
-    def transform_input(self, inputs: list[str], model_kwargs: dict) -> bytes:
+    def transform_input(self, inputs: list[str], model_kwargs: dict) -> bytes:  # pragma: no cover
         input_str = json.dumps({"inputs": inputs, **model_kwargs})
         return input_str.encode("utf-8")
 
-    def transform_output(self, output: bytes) -> list[list[float]]:
+    def transform_output(self, output: bytes) -> list[list[float]]:  # pragma: no cover
         response_json = json.loads(output.read().decode("utf-8"))
         return response_json["vectors"]

diff --git a/poetry.lock b/poetry.lock
index 94f424d..1c5532a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -1175,6 +1175,7 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li
 optional = false
 python-versions = ">=3.6"
 files = [
+    {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"},
     {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9d3c0f8567ffe7502d969c2c1b809892dc793b5d0665f602aad19895f8d508da"},
     {file = "lxml-5.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5fcfbebdb0c5d8d18b84118842f31965d59ee3e66996ac842e21f957eb76138c"},
     {file = "lxml-5.1.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f37c6d7106a9d6f0708d4e164b707037b7380fcd0b04c5bd9cae1fb46a856fb"},
@@ -1184,6 +1185,7 @@ files = [
     {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:82bddf0e72cb2af3cbba7cec1d2fd11fda0de6be8f4492223d4a268713ef2147"},
     {file = "lxml-5.1.0-cp310-cp310-win32.whl", hash = "sha256:b66aa6357b265670bb574f050ffceefb98549c721cf28351b748be1ef9577d93"},
     {file = "lxml-5.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:4946e7f59b7b6a9e27bef34422f645e9a368cb2be11bf1ef3cafc39a1f6ba68d"},
+    {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:14deca1460b4b0f6b01f1ddc9557704e8b365f55c63070463f6c18619ebf964f"},
     {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed8c3d2cd329bf779b7ed38db176738f3f8be637bb395ce9629fc76f78afe3d4"},
     {file = "lxml-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:436a943c2900bb98123b06437cdd30580a61340fbdb7b28aaf345a459c19046a"},
     {file = "lxml-5.1.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:acb6b2f96f60f70e7f34efe0c3ea34ca63f19ca63ce90019c6cbca6b676e81fa"},
@@ -1193,6 +1195,7 @@ files = [
     {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f4c9bda132ad108b387c33fabfea47866af87f4ea6ffb79418004f0521e63204"},
     {file = "lxml-5.1.0-cp311-cp311-win32.whl", hash = "sha256:bc64d1b1dab08f679fb89c368f4c05693f58a9faf744c4d390d7ed1d8223869b"},
     {file = "lxml-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5ab722ae5a873d8dcee1f5f45ddd93c34210aed44ff2dc643b5025981908cda"},
+    {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9aa543980ab1fbf1720969af1d99095a548ea42e00361e727c58a40832439114"},
     {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6f11b77ec0979f7e4dc5ae081325a2946f1fe424148d3945f943ceaede98adb8"},
     {file = "lxml-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a36c506e5f8aeb40680491d39ed94670487ce6614b9d27cabe45d94cd5d63e1e"},
"lxml-5.1.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f643ffd2669ffd4b5a3e9b41c909b72b2a1d5e4915da90a77e119b8d48ce867a"}, @@ -1218,8 +1221,8 @@ files = [ {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8f52fe6859b9db71ee609b0c0a70fea5f1e71c3462ecf144ca800d3f434f0764"}, {file = "lxml-5.1.0-cp37-cp37m-win32.whl", hash = "sha256:d42e3a3fc18acc88b838efded0e6ec3edf3e328a58c68fbd36a7263a874906c8"}, {file = "lxml-5.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eac68f96539b32fce2c9b47eb7c25bb2582bdaf1bbb360d25f564ee9e04c542b"}, + {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ae15347a88cf8af0949a9872b57a320d2605ae069bcdf047677318bc0bba45b1"}, {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c26aab6ea9c54d3bed716b8851c8bfc40cb249b8e9880e250d1eddde9f709bf5"}, - {file = "lxml-5.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cfbac9f6149174f76df7e08c2e28b19d74aed90cad60383ad8671d3af7d0502f"}, {file = "lxml-5.1.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:342e95bddec3a698ac24378d61996b3ee5ba9acfeb253986002ac53c9a5f6f84"}, {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:725e171e0b99a66ec8605ac77fa12239dbe061482ac854d25720e2294652eeaa"}, {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d184e0d5c918cff04cdde9dbdf9600e960161d773666958c9d7b565ccc60c45"}, @@ -1227,6 +1230,7 @@ files = [ {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6d48fc57e7c1e3df57be5ae8614bab6d4e7b60f65c5457915c26892c41afc59e"}, {file = "lxml-5.1.0-cp38-cp38-win32.whl", hash = "sha256:7ec465e6549ed97e9f1e5ed51c657c9ede767bc1c11552f7f4d022c4df4a977a"}, {file = "lxml-5.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:b21b4031b53d25b0858d4e124f2f9131ffc1530431c6d1321805c90da78388d1"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:52427a7eadc98f9e62cb1368a5079ae826f94f05755d2d567d93ee1bc3ceb354"}, {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a2a2c724d97c1eb8cf966b16ca2915566a4904b9aad2ed9a09c748ffe14f969"}, {file = "lxml-5.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843b9c835580d52828d8f69ea4302537337a21e6b4f1ec711a52241ba4a824f3"}, {file = "lxml-5.1.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b99f564659cfa704a2dd82d0684207b1aadf7d02d33e54845f9fc78e06b7581"}, @@ -2049,6 +2053,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = 
"PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2956,7 +2961,5 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" - python-versions = ">=3.10,<3.13" - -content-hash = "f8d8c0c240311fcb449d2f2b86f21dec403402589d53549276b560ab51578a80" +content-hash = "0e6d5093c842e81dfb6223736bf05146fd0c1a125c089ad006505d876b135eba" From 72b8b2920df0a453ddf7707e2072a2796f8982e0 Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 4 Mar 2024 12:49:10 -0300 Subject: [PATCH 2/7] fix txt test --- app/tests/test_document_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app/tests/test_document_loader.py b/app/tests/test_document_loader.py index 262e325..37d0862 100644 --- a/app/tests/test_document_loader.py +++ b/app/tests/test_document_loader.py @@ -160,6 +160,7 @@ def test_docx_loader_cls(self): split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter) self.assertEqual(list, type(split_pages)) + @mock.patch.dict(os.environ, {"AWS_STORAGE_BUCKET_NAME": "file-path"}) def test_txt_loader_cls(self): file_path = f'{self.path}/{self.file_name}.txt' docx_loader = TxtLoader(file_path) From bc29ee860b89a6ea280b9e3a9ee06a001bfb17ce Mon Sep 17 00:00:00 2001 From: Alisson Date: Mon, 4 Mar 2024 14:23:48 -0300 Subject: [PATCH 3/7] add xlrd missing dependency --- poetry.lock | 18 +++++++++++++++++- pyproject.toml | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 1c5532a..0f2bbcd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2845,6 +2845,22 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "xlrd" +version = "2.0.1" +description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"}, + {file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"}, +] + +[package.extras] +build = ["twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "xlsxwriter" version = "3.1.9" @@ -2962,4 +2978,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "0e6d5093c842e81dfb6223736bf05146fd0c1a125c089ad006505d876b135eba" +content-hash = "ef2a8fe841c241406573f67e3b818b0fdbfe71a059fa9c3f59ca3b548a6e4ca8" diff --git a/pyproject.toml b/pyproject.toml index 389f2ae..f1c7222 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ pydantic = "2.3.0" celery = "^5.3.6" redis = "^5.0.1" sentry-sdk = {extras = ["fastapi"], version = "^1.35.0"} +xlrd = "^2.0.1" [tool.poetry.group.dev.dependencies] From 30e96b42b1865f28dbe61701efc9c9e90ee6ed14 Mon Sep 17 00:00:00 2001 From: Alisson Date: Tue, 12 Mar 2024 15:37:46 -0300 Subject: [PATCH 4/7] add punkt --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 6e3ea55..363a72f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,7 @@ RUN poetry config virtualenvs.create false && \ RUN apt update && apt install libmagic1 -y RUN poetry add python-magic +RUN python -m nltk.downloader punkt COPY . . 
From 011869f57076f9d44d66a37a958f4a765814d2ab Mon Sep 17 00:00:00 2001
From: Alisson
Date: Tue, 12 Mar 2024 16:09:14 -0300
Subject: [PATCH 5/7] add directory

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 363a72f..54e31ae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,7 @@ RUN poetry config virtualenvs.create false && \
     poetry install --no-dev
 
 RUN apt update && apt install libmagic1 -y
 RUN poetry add python-magic
-RUN python -m nltk.downloader punkt
+RUN python -m nltk.downloader punkt -d /usr/share/nltk_data
 
 COPY . .

From 473c2b9b9b10af5bfffad1ede6f2cdf8dead0ed6 Mon Sep 17 00:00:00 2001
From: Alisson
Date: Wed, 13 Mar 2024 09:40:36 -0300
Subject: [PATCH 6/7] add averaged_perceptron_tagger package

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 54e31ae..8eadd8c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,7 @@ RUN poetry config virtualenvs.create false && \
     poetry install --no-dev
 
 RUN apt update && apt install libmagic1 -y
 RUN poetry add python-magic
-RUN python -m nltk.downloader punkt -d /usr/share/nltk_data
+RUN python -m nltk.downloader punkt averaged_perceptron_tagger -d /usr/share/nltk_data
 
 COPY . .

From f71a523214296735803157706605fe5fbb3efd2f Mon Sep 17 00:00:00 2001
From: Alisson
Date: Wed, 13 Mar 2024 15:57:30 -0300
Subject: [PATCH 7/7] split url text into pages

---
 app/loaders/loaders.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py
index 4fb1cc2..aad8fc3 100644
--- a/app/loaders/loaders.py
+++ b/app/loaders/loaders.py
@@ -210,5 +210,15 @@ def load(self) -> List[Document]:
         return self.loader.load()
 
     def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
-        docs = self.loader.load_and_split(text_splitter)
-        return docs
+        split_pages = []
+
+        pages = self.loader.load_and_split()
+        for page in pages:
+            page_content = page.page_content.lower()
+            metadatas = page.metadata
+            metadatas.update({"full_page": page_content})
+
+            text_chunks = text_splitter.split_text(page_content)
+            for chunk in text_chunks:
+                split_pages.append(Document(page_content=chunk, metadata=metadatas))
+        return split_pages
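With patch 7 in place, URLsLoader chunks a fetched page the same way the file loaders do: the page text is lowercased, the full text is kept under the "full_page" metadata key, and one Document is emitted per splitter chunk. A minimal end-to-end sketch follows; SimpleSplitter is a hypothetical stand-in for the project's ITextSplitter implementations (load_and_split_text only calls its split_text method), and the URL is the one the tests use:

from typing import List

from app.loaders.loaders import URLsLoader


class SimpleSplitter:
    """Hypothetical splitter: fixed-size character chunks, no overlap."""

    def __init__(self, chunk_size: int = 500) -> None:
        self.chunk_size = chunk_size

    def split_text(self, text: str) -> List[str]:
        return [text[i:i + self.chunk_size]
                for i in range(0, len(text), self.chunk_size)]


loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing")
docs = loader.load_and_split_text(SimpleSplitter())
for doc in docs[:3]:
    # Every chunk of a page shares one metadata dict, including the
    # lowercased source text under "full_page".
    print(doc.metadata.get("source"), len(doc.page_content))

One design note: metadatas in patch 7 is a reference to page.metadata rather than a copy, so all chunks from one page share a single metadata dict. That is fine for read-only consumers, but worth knowing before mutating chunk metadata downstream.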