Provide Indexer to index files #27

Merged · 12 commits · Nov 21, 2024
@@ -68,7 +68,8 @@ public void indexFile(
+ "/data_sources/"
+ ragDocument.dataSourceId()
+ "/documents/download-and-index",
new IndexRequest(bucketName, ragDocument.s3Path(), configuration));
new IndexRequest(
ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration));
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -97,6 +98,7 @@ public void deleteSession(Long sessionId) {
}

record IndexRequest(
@JsonProperty("document_id") String documentId,
@JsonProperty("s3_bucket_name") String s3BucketName,
@JsonProperty("s3_document_key") String s3DocumentKey,
IndexConfiguration configuration) {}
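
For reference, Jackson serializes the IndexRequest record above into a JSON body shaped like {"document_id": ..., "s3_bucket_name": ..., "s3_document_key": ..., "configuration": {...}}. Below is a minimal sketch of a Pydantic model the llm-service side could use to accept that body; the model and the IndexConfiguration field names are assumptions for illustration, not taken from this diff.

from pydantic import BaseModel


class IndexConfigurationModel(BaseModel):
    # Field names are assumed: IndexConfiguration(123, 2) in the test below
    # suggests a chunk size and an overlap, but the diff does not name them.
    chunk_size: int
    chunk_overlap: int


class DownloadAndIndexRequest(BaseModel):
    # Keys mirror the @JsonProperty annotations on the Java record.
    document_id: str
    s3_bucket_name: str
    s3_document_key: str
    configuration: IndexConfigurationModel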
@@ -57,7 +57,7 @@ void indexFile() {
Tracker<TrackedHttpRequest<?>> tracker = new Tracker<>();
RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker));
IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2);
RagDocument document = indexRequest("s3Path", 1234L);
RagDocument document = indexRequest("documentId", "s3Path", 1234L);

client.indexFile(document, "bucketName", indexConfiguration);

@@ -68,14 +68,15 @@ void indexFile() {
new TrackedHttpRequest<>(
HttpMethod.POST,
"http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index",
new RagBackendClient.IndexRequest("bucketName", "s3Path", indexConfiguration)));
new RagBackendClient.IndexRequest(
"documentId", "bucketName", "s3Path", indexConfiguration)));
}

@Test
void createSummary() {
Tracker<TrackedHttpRequest<?>> tracker = new Tracker<>();
RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker));
RagDocument document = indexRequest("s3Path", 1234L);
RagDocument document = indexRequest("documentId", "s3Path", 1234L);

client.createSummary(document, "bucketName");

@@ -134,13 +135,25 @@ void deleteSession() {
void null_handlesThrowable() {
RagBackendClient client =
RagBackendClient.createNull(new Tracker<>(), new NotFound("not found"));
RagDocument document = indexRequest("s3Path", 1234L);
RagDocument document = indexRequest("documentId", "s3Path", 1234L);
assertThatThrownBy(() -> client.indexFile(document, "fakeit", null))
.isInstanceOf(NotFound.class);
}

private static RagDocument indexRequest(String s3Path, Long dataSourceId) {
private static RagDocument indexRequest(String documentId, String s3Path, Long dataSourceId) {
return new RagDocument(
null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null);
null,
null,
dataSourceId,
documentId,
s3Path,
null,
null,
null,
null,
null,
null,
null,
null);
}
}
37 changes: 37 additions & 0 deletions llm-service/app/ai/__init__.py
@@ -0,0 +1,37 @@
#
# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
# (C) Cloudera, Inc. 2024
# All rights reserved.
#
# Applicable Open Source License: Apache 2.0
#
# NOTE: Cloudera open source products are modular software products
# made up of hundreds of individual components, each of which was
# individually copyrighted. Each Cloudera open source product is a
# collective work under U.S. Copyright Law. Your license to use the
# collective work is as provided in your written agreement with
# Cloudera. Used apart from the collective work, this file is
# licensed for your use pursuant to the open source license
# identified above.
#
# This code is provided to you pursuant a written agreement with
# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
# this code. If you do not have a written agreement with Cloudera nor
# with an authorized and properly licensed third party, you do not
# have any rights to access nor to use this code.
#
# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#
37 changes: 37 additions & 0 deletions llm-service/app/ai/indexing/__init__.py
@@ -0,0 +1,37 @@
# [standard Cloudera AMP license header, identical to the one above]
131 changes: 131 additions & 0 deletions llm-service/app/ai/indexing/index.py
@@ -0,0 +1,131 @@
# [standard Cloudera AMP license header, identical to the one above]

from dataclasses import dataclass
import logging
import os
from typing import Dict, List, Type

from .readers.pdf import PDFReader
from .readers.nop import NopReader
from llama_index.core.readers.base import BaseReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document
from llama_index.core.base.embeddings.base import BaseEmbedding
from ...services.vector_store import VectorStore
from llama_index.core.node_parser.interface import BaseNode

logger = logging.getLogger(__name__)

READERS: Dict[str, Type[BaseReader]] = {
".pdf": PDFReader,
".txt": NopReader,
".md": NopReader,
}
CHUNKABLE_FILE_EXTENSIONS = set(
[
".pdf",
".txt",
".md",
]
)

@dataclass
class NotSupportedFileExtensionError(Exception):
file_extension: str

class Indexer:
def __init__(self, data_source_id: int, splitter: SentenceSplitter, embedding_model: BaseEmbedding, chunks_vector_store: VectorStore):
self.data_source_id = data_source_id
self.splitter = splitter
self.embedding_model = embedding_model
self.chunks_vector_store = chunks_vector_store

def index_file(self, file_path: str, file_id: str):
logger.debug(f"Indexing file: {file_path}")

file_extension = os.path.splitext(file_path)[1]
reader_cls = READERS.get(file_extension)
if not reader_cls:
raise NotSupportedFileExtensionError(file_extension)

reader = reader_cls()

logger.debug(f"Parsing file: {file_path}")

documents = self._documents_in_file(reader, file_path, file_id)
if file_extension in CHUNKABLE_FILE_EXTENSIONS:
logger.debug(f"Chunking file: {file_path}")
chunks = [chunk for document in documents for chunk in self._chunks_in_document(document)]
else:
chunks = documents

texts = [chunk.text for chunk in chunks]
logger.debug(f"Embedding {len(texts)} chunks")
embeddings = self.embedding_model.get_text_embedding_batch(texts)

for chunk, embedding in zip(chunks, embeddings):
chunk.embedding = embedding

logger.debug(f"Adding {len(chunks)} chunks to vector store")
chunks_vector_store = self.chunks_vector_store.access_vector_store()
chunks_vector_store.add(chunks)

logger.debug(f"Indexing file: {file_path} completed")

def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]:
documents = reader.load_data(file_path)

for i, document in enumerate(documents):
# Update the document metadata
document.metadata["file_id"] = file_id
document.metadata["document_part_number"] = i
document.metadata["data_source_id"] = self.data_source_id

return documents

def _chunks_in_document(self, document: Document) -> List[BaseNode]:
chunks = self.splitter.get_nodes_from_documents([document])

for j, chunk in enumerate(chunks):
chunk.metadata["file_id"] = document.metadata["file_id"]
chunk.metadata["document_part_number"] = document.metadata["document_part_number"]
chunk.metadata["chunk_number"] = j
chunk.metadata["data_source_id"] = document.metadata["data_source_id"]

return chunks
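
Taken together, index_file looks up a reader by file extension, splits the resulting documents into chunks for chunkable extensions, embeds the chunks in one batch, and adds them to the vector store. A minimal usage sketch under assumed wiring (the route or worker that actually constructs the Indexer is not part of this section; the splitter settings and file path are example values):

from llama_index.core.node_parser import SentenceSplitter

# embedding_model: any llama-index BaseEmbedding implementation (assumed available)
# chunks_vector_store: the app's VectorStore wrapper (assumed available)
indexer = Indexer(
    data_source_id=1234,
    splitter=SentenceSplitter(chunk_size=512, chunk_overlap=64),
    embedding_model=embedding_model,
    chunks_vector_store=chunks_vector_store,
)
indexer.index_file("/tmp/example.pdf", file_id="documentId")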
37 changes: 37 additions & 0 deletions llm-service/app/ai/indexing/readers/__init__.py
@@ -0,0 +1,37 @@
# [standard Cloudera AMP license header, identical to the one above]
46 changes: 46 additions & 0 deletions llm-service/app/ai/indexing/readers/nop.py
@@ -0,0 +1,46 @@
# [standard Cloudera AMP license header, identical to the one above]

from typing import List
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

class NopReader(BaseReader):
def load_data(self, file_path: str) -> List[Document]:
with open(file_path, "r") as f:
return [Document(text=f.read())]
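
The PDFReader imported in index.py lives in readers/pdf.py, which is not shown in this section. A rough sketch of what such a reader could look like, assuming the pypdf package; the actual implementation in this PR may differ:

from typing import List

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from pypdf import PdfReader as PyPdfReader


class PDFReader(BaseReader):
    def load_data(self, file_path: str) -> List[Document]:
        reader = PyPdfReader(file_path)
        # One Document per page keeps page boundaries available to the splitter.
        return [Document(text=page.extract_text() or "") for page in reader.pages]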