-
Notifications
You must be signed in to change notification settings - Fork 136
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support Dataprep Microservice with Llama Index (#154)
* move file to langchain folder Signed-off-by: letonghan <letong.han@intel.com> * support dataprep with llama_index Signed-off-by: letonghan <letong.han@intel.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add e2e test script Signed-off-by: letonghan <letong.han@intel.com> * update test script name Signed-off-by: letonghan <letong.han@intel.com> --------- Signed-off-by: letonghan <letong.han@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
0dedc28
commit f7443f2
Showing
18 changed files
with
312 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
|
||
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") | ||
|
||
# Redis Connection Information | ||
REDIS_HOST = os.getenv("REDIS_HOST", "localhost") | ||
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379)) | ||
|
||
|
||
def get_boolean_env_var(var_name, default_value=False):
    """Retrieve the boolean value of an environment variable.

    The comparison is case-insensitive and ignores surrounding whitespace,
    so values such as ``" True "`` or ``"1\\n"`` are still recognized.

    Args:
        var_name (str): The name of the environment variable to retrieve.
        default_value (bool): The value to return if the variable is not
            set or holds a string that is neither a known true nor a known
            false spelling.

    Returns:
        bool: The value of the environment variable, interpreted as a boolean.
    """
    true_values = {"true", "1", "t", "y", "yes"}
    false_values = {"false", "0", "f", "n", "no"}

    # An unset variable is treated like an empty string, which matches
    # neither set and therefore yields the default.
    value = os.getenv(var_name, "").strip().lower()

    if value in true_values:
        return True
    if value in false_values:
        return False
    return default_value
|
||
|
||
def format_redis_conn_from_env():
    """Build the Redis connection URL from environment variables.

    If ``REDIS_URL`` is set to a non-empty value it is returned unchanged.
    Otherwise the URL is assembled from ``REDIS_SSL`` (selects the
    ``rediss://`` scheme), the optional ``REDIS_USERNAME`` /
    ``REDIS_PASSWORD`` credentials, and the module-level ``REDIS_HOST`` and
    ``REDIS_PORT``.

    Returns:
        str: A Redis connection URL.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    redis_url = os.getenv("REDIS_URL", None)
    if redis_url:
        return redis_url

    using_ssl = get_boolean_env_var("REDIS_SSL", False)
    start = "rediss://" if using_ssl else "redis://"

    # if using RBAC
    password = os.getenv("REDIS_PASSWORD", None)
    username = os.getenv("REDIS_USERNAME", "default")
    if password is not None:
        # Percent-encode the credentials so that reserved characters
        # ('@', ':', '/', '%', ...) cannot corrupt the URL structure.
        start += f"{quote(username, safe='')}:{quote(password, safe='')}@"

    return start + f"{REDIS_HOST}:{REDIS_PORT}"
|
||
|
||
INDEX_NAME = os.getenv("INDEX_NAME", "rag-redis") | ||
REDIS_URL = format_redis_conn_from_env() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
|
||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
FROM ubuntu:22.04 | ||
|
||
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ | ||
libgl1-mesa-glx \ | ||
libjemalloc-dev \ | ||
vim \ | ||
python3 \ | ||
python3-pip | ||
|
||
RUN useradd -m -s /bin/bash user && \ | ||
mkdir -p /home/user && \ | ||
chown -R user /home/user/ | ||
|
||
USER user | ||
|
||
COPY comps /home/user/comps | ||
|
||
RUN pip install --no-cache-dir --upgrade pip && \ | ||
pip install --no-cache-dir -r /home/user/comps/dataprep/redis/llama_index/requirements.txt | ||
|
||
ENV PYTHONPATH=$PYTHONPATH:/home/user | ||
|
||
WORKDIR /home/user/comps/dataprep/redis/llama_index | ||
|
||
ENTRYPOINT ["python3", "prepare_doc_redis.py"] | ||
|
28 changes: 28 additions & 0 deletions
28
comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
version: "3" | ||
services: | ||
redis-vector-db: | ||
image: redis/redis-stack:7.2.0-v9 | ||
container_name: redis-vector-db | ||
ports: | ||
- "6379:6379" | ||
- "8001:8001" | ||
dataprep-redis: | ||
image: opea/dataprep-redis:latest | ||
container_name: dataprep-redis-server | ||
ports: | ||
- "6007:6007" | ||
ipc: host | ||
environment: | ||
http_proxy: ${http_proxy} | ||
https_proxy: ${https_proxy} | ||
REDIS_URL: ${REDIS_URL} | ||
INDEX_NAME: ${INDEX_NAME} | ||
LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} | ||
restart: unless-stopped | ||
|
||
networks: | ||
default: | ||
driver: bridge |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import json | ||
import os | ||
from pathlib import Path | ||
from typing import List, Optional, Union | ||
|
||
from config import EMBED_MODEL, INDEX_NAME, REDIS_URL | ||
from fastapi import File, Form, HTTPException, UploadFile | ||
from langsmith import traceable | ||
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex | ||
from llama_index.core.settings import Settings | ||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding | ||
from llama_index.vector_stores.redis import RedisVectorStore | ||
from redis import Redis | ||
from redisvl.schema import IndexSchema | ||
|
||
from comps import DocPath, opea_microservices, register_microservice | ||
|
||
|
||
async def save_file_to_local_disk(save_path: str, file):
    """Write the full contents of an uploaded file to ``save_path``.

    Args:
        save_path (str): Destination path on the local disk; it is opened in
            binary write mode.
        file: An object exposing an awaitable ``read()`` that returns the
            bytes to store.

    Raises:
        HTTPException: With status 500 if reading the upload or writing the
            destination file fails.
    """
    save_path = Path(save_path)
    with save_path.open("wb") as sink:
        try:
            sink.write(await file.read())
        except Exception as e:
            print(f"Write file failed. Exception: {e}")
            raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
|
||
|
||
async def ingest_data_to_redis(doc_path: DocPath):
    """Load one document, embed it and store it in the Redis vector index.

    A HuggingFace embedder for ``EMBED_MODEL`` is constructed on every call
    and installed as the global LlamaIndex embedding model. The document at
    ``doc_path.path`` is then read with ``SimpleDirectoryReader`` and indexed
    into the Redis index named ``INDEX_NAME`` at ``REDIS_URL``.

    Args:
        doc_path (DocPath): Wrapper whose ``path`` attribute points at the
            file to ingest.

    Returns:
        bool: ``True`` once indexing has completed.
    """
    embedder = HuggingFaceEmbedding(model_name=EMBED_MODEL)
    print(f"embedder: {embedder}")
    Settings.embed_model = embedder
    doc_path = doc_path.path
    content = SimpleDirectoryReader(input_files=[doc_path]).load_data()
    redis_client = Redis.from_url(REDIS_URL)
    schema = IndexSchema.from_dict(
        {
            "index": {"name": INDEX_NAME, "prefix": f"doc:{INDEX_NAME}"},
            "fields": [
                {"name": "id", "type": "tag"},
                {"name": "doc_id", "type": "tag"},
                {"name": "text", "type": "text"},
                {"name": "content", "type": "text"},
                {"name": "source", "type": "text"},
                {"name": "start_index", "type": "numeric"},
                {
                    "name": "vector",
                    "type": "vector",
                    # NOTE(review): dims is hard-coded; presumably it matches
                    # the default EMBED_MODEL and must change if a model with
                    # a different embedding size is configured - confirm.
                    # The attribute key is "datatype"; the previous
                    # "date_type" spelling was a typo.
                    "attrs": {"dims": 768, "algorithm": "HNSW", "datatype": "FLOAT32"},
                },
            ],
        }
    )
    vector_store = RedisVectorStore(redis_client=redis_client, schema=schema)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Building the index is what embeds the documents and writes them to Redis;
    # the returned index object itself is not needed here.
    _ = VectorStoreIndex.from_documents(content, storage_context=storage_context)
    print("[ ingest data ] data ingested into Redis DB.")
    return True
|
||
|
||
@register_microservice(name="opea_service@prepare_doc_redis", endpoint="/v1/dataprep", host="0.0.0.0", port=6007)
@traceable(run_type="tool")
# The llama_index implementation currently supports uploaded files only.
async def ingest_documents(files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)):
    """Save uploaded files locally and ingest each one into Redis.

    Args:
        files: A single uploaded file or a list of uploaded files.

    Returns:
        dict: ``{"status": 200, "message": "Data preparation succeeded"}``
        when every file has been saved and ingested.

    Raises:
        HTTPException: 400 if no file is provided or a file name is unusable;
            500 if saving or ingesting any file fails.
    """
    print(f"files:{files}")
    if not files:
        raise HTTPException(status_code=400, detail="Please provide at least one file.")

    if not isinstance(files, list):
        files = [files]

    upload_folder = "./uploaded_files/"
    # exist_ok=True already makes this a no-op when the folder exists.
    Path(upload_folder).mkdir(parents=True, exist_ok=True)

    # The file name comes from the client and is untrusted: keep only its
    # final path component so that names such as "../../etc/passwd" cannot
    # escape the upload folder. Validation happens before any file is written.
    targets = []
    for file in files:
        raw_name = str(file.filename or "").replace("\\", "/")
        safe_name = Path(raw_name).name
        if safe_name in ("", ".", ".."):
            raise HTTPException(status_code=400, detail=f"Invalid file name: {file.filename!r}")
        targets.append((file, upload_folder + safe_name))

    try:
        for file, save_path in targets:
            await save_file_to_local_disk(save_path, file)
            await ingest_data_to_redis(DocPath(path=save_path))
            print(f"Successfully saved file {save_path}")
        return {"status": 200, "message": "Data preparation succeeded"}
    except Exception as e:
        print(f"Data preparation failed. Exception: {e}")
        raise HTTPException(status_code=500, detail=f"Data preparation failed. Exception: {e}")
|
||
|
||
if __name__ == "__main__": | ||
opea_microservices["opea_service@prepare_doc_redis"].start() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
docarray[full] | ||
fastapi | ||
huggingface_hub | ||
langsmith | ||
llama-index | ||
llama-index-embeddings-huggingface==0.2.0 | ||
llama-index-readers-file | ||
llama-index-vector-stores-redis | ||
numpy | ||
opentelemetry-api | ||
opentelemetry-exporter-otlp | ||
opentelemetry-sdk | ||
redis | ||
sentence_transformers | ||
shortuuid |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/bin/bash | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
set -xe | ||
|
||
WORKPATH=$(dirname "$PWD") | ||
LOG_PATH="$WORKPATH/tests" | ||
ip_address=$(hostname -I | awk '{print $1}') | ||
|
||
# Build the dataprep image (tagged opea/dataprep-redis:comps) from the
# repository root, forwarding the caller's proxy settings as build args.
function build_docker_images() {
    # Quote every expansion so paths or proxy URLs containing spaces or glob
    # characters are passed through as single arguments.
    cd "$WORKPATH"
    pwd
    docker build --no-cache -t opea/dataprep-redis:comps --build-arg https_proxy="$https_proxy" --build-arg http_proxy="$http_proxy" -f comps/dataprep/redis/llama_index/docker/Dockerfile .
}
|
||
# Start the Redis stack container and the dataprep service container, then
# wait a fixed minute for the service to come up.
function start_service() {
    docker run -d --name="test-comps-dataprep-redis" -e http_proxy="$http_proxy" -e https_proxy="$https_proxy" -p 6379:6379 -p 8001:8001 --ipc=host redis/redis-stack:7.2.0-v9
    dataprep_service_port=5011
    # The service container reaches Redis through the host's published port.
    REDIS_URL="redis://${ip_address}:6379"
    # Expansions are quoted so each value stays a single argument.
    docker run -d --name="test-comps-dataprep-redis-server" -e http_proxy="$http_proxy" -e https_proxy="$https_proxy" -e REDIS_URL="$REDIS_URL" -p "${dataprep_service_port}:6007" --ipc=host opea/dataprep-redis:comps
    sleep 1m
}
|
||
# Upload a small text file to the dataprep endpoint and verify both the HTTP
# status and the response body. On any mismatch the service container logs are
# appended to ${LOG_PATH}/dataprep.log and the script exits with status 1.
function validate_microservice() {
    dataprep_service_port=5011
    URL="http://${ip_address}:$dataprep_service_port/v1/dataprep"
    local expected_result="Data preparation succeeded"
    local response_file="${LOG_PATH}/dataprep.log"
    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > ./dataprep_file.txt

    # One request yields both the status code (stdout via -w) and the body
    # (written to the log file), so the document is ingested only once.
    # '|| true' keeps 'set -e' from aborting on a transport error, letting the
    # failure branch below collect the container logs instead.
    HTTP_STATUS=$(curl -s -o "$response_file" -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") || true
    if [ "$HTTP_STATUS" -eq 200 ]; then
        echo "[ dataprep ] HTTP status is 200. Checking content..."
        # Declared separately from the assignment so 'local' does not mask
        # the exit status of the command substitution.
        local CONTENT
        CONTENT=$(cat "$response_file")

        # Check the actual response body for the expected message.
        if echo "$CONTENT" | grep -qF "$expected_result"; then
            echo "[ dataprep ] Content is as expected."
        else
            echo "[ dataprep ] Content does not match the expected result: $CONTENT"
            docker logs test-comps-dataprep-redis-server >> "${LOG_PATH}/dataprep.log"
            exit 1
        fi
    else
        echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS"
        docker logs test-comps-dataprep-redis-server >> "${LOG_PATH}/dataprep.log"
        exit 1
    fi
}
|
||
# Stop and remove every container whose name matches the test prefix.
function stop_docker() {
    container_ids=$(docker ps -aq --filter "name=test-comps-dataprep-redis*")
    # $container_ids is left unquoted on purpose: it may hold several IDs that
    # must be passed to docker as separate arguments.
    if [[ -n "$container_ids" ]]; then
        docker stop $container_ids && docker rm $container_ids && sleep 1s
    fi
}
|
||
# Run the end-to-end test: clean up leftovers, build, start, validate, clean up.
function main() {
    # Remove containers left behind by a previous run.
    stop_docker

    build_docker_images
    start_service

    validate_microservice

    # Tear down the test containers and reclaim unused Docker resources.
    stop_docker
    echo y | docker system prune
}
|
||
main |