diff --git a/comps/dataprep/qdrant/README.md b/comps/dataprep/qdrant/README.md
index 24f58fc09..30aefb43d 100644
--- a/comps/dataprep/qdrant/README.md
+++ b/comps/dataprep/qdrant/README.md
@@ -47,7 +47,7 @@ docker build -t opea/dataprep-qdrant:latest --build-arg https_proxy=$https_proxy
 ## Run Docker with CLI
 
 ```bash
-docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-qdrant:latest
+docker run -d --name="dataprep-qdrant-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-qdrant:latest
 ```
 
 ## Setup Environment Variables
@@ -55,7 +55,7 @@ docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_pr
 ```bash
 export http_proxy=${your_http_proxy}
 export https_proxy=${your_http_proxy}
-export QDRANT=${host_ip}
+export QDRANT_HOST=${host_ip}
 export QDRANT_PORT=6333
 export COLLECTION_NAME=${your_collection_name}
 ```
@@ -72,13 +72,21 @@ docker compose -f docker-compose-dataprep-qdrant.yaml up -d
 Once the document preparation microservice for Qdrant is started, users can use the command below to invoke the microservice, which converts a document to embeddings and saves them to the database.
 
 ```bash
-curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  http://localhost:6007/v1/dataprep
 ```
 
 You can specify chunk_size and chunk_overlap with the following command.
 
 ```bash
-curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","chunk_size":1500,"chunk_overlap":100}' http://localhost:6000/v1/dataprep
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  -F "chunk_size=1500" \
+  -F "chunk_overlap=100" \
+  http://localhost:6007/v1/dataprep
 ```
 
 We support table extraction from PDF documents. You can specify process_table and table_strategy with the following command. "table_strategy" refers to the strategy used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
@@ -86,5 +94,10 @@ We support table extraction from pdf documents. You can specify process_table an
 Note: If you specify "table_strategy=llm", you should first start the TGI service (refer to sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
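+
+For reference, a request using the "llm" strategy should look like the following sketch (it assumes the TGI service above is already running and simply swaps the strategy value; replace `your_file.pdf` with your own document):
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./your_file.pdf" \
+  -F "process_table=true" \
+  -F "table_strategy=llm" \
+  http://localhost:6007/v1/dataprep
+```
+
+The same upload with the "hq" strategy: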
 
 ```bash
-curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","process_table":true,"table_strategy":"hq"}' http://localhost:6000/v1/dataprep
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./your_file.pdf" \
+  -F "process_table=true" \
+  -F "table_strategy=hq" \
+  http://localhost:6007/v1/dataprep
 ```
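+
+The microservice can also ingest content fetched from web pages. Pass `link_list` as a JSON-encoded list of URLs (a sketch mirroring the file-upload examples above; the same chunking and table form fields apply):
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F 'link_list=["https://www.ces.tech/"]' \
+  http://localhost:6007/v1/dataprep
+```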
diff --git a/comps/dataprep/qdrant/config.py b/comps/dataprep/qdrant/config.py
index 2b30a3682..7cf37f404 100644
--- a/comps/dataprep/qdrant/config.py
+++ b/comps/dataprep/qdrant/config.py
@@ -7,7 +7,8 @@
 EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 
 # Qdrant configuration
-QDRANT_HOST = os.getenv("QDRANT", "localhost")
+QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
 QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333))
 COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-qdrant")
+TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")  # imported by prepare_doc_qdrant.py; falls back to a local embedder when unset
diff --git a/comps/dataprep/qdrant/docker/Dockerfile b/comps/dataprep/qdrant/docker/Dockerfile
index bdf0315e2..ff9f6b253 100644
--- a/comps/dataprep/qdrant/docker/Dockerfile
+++ b/comps/dataprep/qdrant/docker/Dockerfile
@@ -12,6 +12,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
     build-essential \
     libgl1-mesa-glx \
     libjemalloc-dev \
+    default-jre \
     vim
 
 RUN useradd -m -s /bin/bash user && \
@@ -22,13 +23,18 @@ USER user
 
 COPY comps /home/user/comps
 
-RUN pip install --no-cache-dir --upgrade pip && \
-    if [ ${ARCH} = "cpu" ]; then pip install torch --index-url https://download.pytorch.org/whl/cpu; fi && \
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
     pip install --no-cache-dir -r /home/user/comps/dataprep/qdrant/requirements.txt
 
 ENV PYTHONPATH=$PYTHONPATH:/home/user
 
+USER root
+
+RUN mkdir -p /home/user/comps/dataprep/qdrant/uploaded_files && chown -R user /home/user/comps/dataprep/qdrant/uploaded_files
+
+USER user
+
 WORKDIR /home/user/comps/dataprep/qdrant
 
 ENTRYPOINT ["python", "prepare_doc_qdrant.py"]
-
diff --git a/comps/dataprep/qdrant/docker/docker-compose-dataprep-qdrant.yaml b/comps/dataprep/qdrant/docker/docker-compose-dataprep-qdrant.yaml
index e86dc2c4e..aaf2a17dd 100644
--- a/comps/dataprep/qdrant/docker/docker-compose-dataprep-qdrant.yaml
+++ b/comps/dataprep/qdrant/docker/docker-compose-dataprep-qdrant.yaml
@@ -9,19 +9,36 @@ services:
     ports:
       - "6333:6333"
       - "6334:6334"
+  tei-embedding-service:
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+    container_name: tei-embedding-server
+    ports:
+      - "6006:80"
+    volumes:
+      - "./data:/data"
+    shm_size: 1g
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
   dataprep-qdrant:
     image: opea/gen-ai-comps:dataprep-qdrant-xeon-server
     container_name: dataprep-qdrant-server
+    depends_on:
+      - qdrant-vector-db
+      - tei-embedding-service
     ports:
-      - "6000:6000"
+      - "6007:6007"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      QDRANT: ${QDRANT}
+      QDRANT_HOST: ${QDRANT_HOST}
       QDRANT_PORT: ${QDRANT_PORT}
       COLLECTION_NAME: ${COLLECTION_NAME}
+      TEI_ENDPOINT: ${TEI_ENDPOINT}
     restart: unless-stopped
 
 networks:
diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py
index 422854eec..fb8d66571 100644
--- a/comps/dataprep/qdrant/prepare_doc_qdrant.py
+++ b/comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -1,30 +1,31 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import os
+import json
+from typing import List, Optional, Union
 
-from config import COLLECTION_NAME, EMBED_MODEL, QDRANT_HOST, QDRANT_PORT
+from config import COLLECTION_NAME, EMBED_MODEL, QDRANT_HOST, QDRANT_PORT, TEI_EMBEDDING_ENDPOINT
+from fastapi import File, Form, HTTPException, UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.vectorstores import Qdrant
+from langchain_huggingface import HuggingFaceEndpointEmbeddings
 from langchain_text_splitters import HTMLHeaderTextSplitter
 
-from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
-from comps.dataprep.utils import document_loader, get_separators, get_tables_result
+from comps import DocPath, opea_microservices, register_microservice
+from comps.dataprep.utils import (
+    document_loader,
+    encode_filename,
+    get_separators,
+    get_tables_result,
+    parse_html,
+    save_content_to_local_disk,
+)
 
-tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
+upload_folder = "./uploaded_files/"
 
-@register_microservice(
-    name="opea_service@prepare_doc_qdrant",
-    endpoint="/v1/dataprep",
-    host="0.0.0.0",
-    port=6000,
-    input_datatype=DocPath,
-    output_datatype=None,
-)
-@opea_telemetry
-def ingest_documents(doc_path: DocPath):
+def ingest_data_to_qdrant(doc_path: DocPath):
     """Ingest document to Qdrant."""
     path = doc_path.path
     print(f"Parsing document {path}.")
@@ -38,23 +39,30 @@ def ingest_documents(doc_path: DocPath):
         text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
     else:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
+            chunk_size=doc_path.chunk_size,
+            chunk_overlap=doc_path.chunk_overlap,
+            add_start_index=True,
+            separators=get_separators(),
         )
 
     content = document_loader(path)
+
     chunks = text_splitter.split_text(content)
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+
     # Create vectorstore
-    if tei_embedding_endpoint:
+    if TEI_EMBEDDING_ENDPOINT:
         # create embeddings using TEI endpoint service
-        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
+        embedder = HuggingFaceEndpointEmbeddings(model=TEI_EMBEDDING_ENDPOINT)
     else:
         # create embeddings using local embedding model
         embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
 
+    print("embedder created.")
+
+    # Batch size
     batch_size = 32
     num_chunks = len(chunks)
@@ -71,6 +79,78 @@ def ingest_documents(doc_path: DocPath):
         )
         print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
 
+    return True
+
+
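+# The endpoint below replaces the old JSON DocPath input: it accepts
+# multipart/form-data with either one or more "files" uploads or a
+# "link_list" JSON string of URLs, plus optional chunking and
+# table-extraction form fields.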
Created ", len(chunks), " chunks of the original pdf") + # Create vectorstore - if tei_embedding_endpoint: + if TEI_EMBEDDING_ENDPOINT: # create embeddings using TEI endpoint service - embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + embedder = HuggingFaceEndpointEmbeddings(model=TEI_EMBEDDING_ENDPOINT) else: # create embeddings using local embedding model embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + print("embedder created.") + # Batch size batch_size = 32 num_chunks = len(chunks) @@ -71,6 +79,78 @@ def ingest_documents(doc_path: DocPath): ) print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + return True + + +@register_microservice( + name="opea_service@prepare_doc_qdrant", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + print(f"files:{files}") + print(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_qdrant( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + print(f"Successfully saved file {save_path}") + + return {"status": 200, "message": "Data preparation succeeded"} + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_qdrant( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + except json.JSONDecodeError: + raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + print(f"Successfully saved link {link}") + + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + if __name__ == "__main__": opea_microservices["opea_service@prepare_doc_qdrant"].start() diff --git a/comps/dataprep/qdrant/requirements.txt b/comps/dataprep/qdrant/requirements.txt index eb92e628f..e5bcf80b3 100644 --- a/comps/dataprep/qdrant/requirements.txt +++ b/comps/dataprep/qdrant/requirements.txt @@ -8,6 +8,7 @@ huggingface_hub langchain langchain-community langchain-text-splitters +langchain_huggingface markdown numpy opentelemetry-api diff --git a/tests/test_dataprep_qdrant_langchain.sh b/tests/test_dataprep_qdrant_langchain.sh new file mode 100644 index 000000000..7d9e47708 --- /dev/null +++ b/tests/test_dataprep_qdrant_langchain.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" 
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+    cd $WORKPATH
+
+    # dataprep qdrant image
+    docker build --no-cache -t opea/dataprep-qdrant:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/qdrant/docker/Dockerfile .
+}
+
+function start_service() {
+    QDRANT_PORT=6360
+    docker run -d --name="test-comps-dataprep-qdrant-langchain" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $QDRANT_PORT:6333 -p 6334:6334 --ipc=host qdrant/qdrant
+    tei_embedding_port=6361
+    model="BAAI/bge-base-en-v1.5"
+    docker run -d --name="test-comps-dataprep-qdrant-langchain-tei" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $tei_embedding_port:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
+    dataprep_service_port=6362
+    TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_embedding_port}"
+    COLLECTION_NAME="rag-qdrant"
+    docker run -d --name="test-comps-dataprep-qdrant-langchain-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e QDRANT_HOST=$ip_address -e QDRANT_PORT=$QDRANT_PORT -e COLLECTION_NAME=$COLLECTION_NAME -e TEI_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-qdrant:comps
+    sleep 1m
+}
+
+function validate_services() {
+    local URL="$1"
+    local EXPECTED_RESULT="$2"
+    local SERVICE_NAME="$3"
+    local DOCKER_NAME="$4"
+    local INPUT_DATA="$5"
+
+    if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
+        cd $LOG_PATH
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
+    elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL")
+    else
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+    fi
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+
+    docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+
+    # check response status
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    # check response body
+    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    sleep 1s
+}
+
+function validate_microservice() {
+    # tei for embedding service
+    validate_services \
+        "${ip_address}:6361/embed" \
+        "[[" \
+        "tei_embedding" \
+        "test-comps-dataprep-qdrant-langchain-tei" \
+        '{"inputs":"What is Deep Learning?"}'
+
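+    # A successful ingestion returns {"status": 200, "message": "Data preparation succeeded"},
+    # which validate_services matches against the response body below.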
+    # dataprep upload file
+    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
+    validate_services \
+        "${ip_address}:6362/v1/dataprep" \
+        "Data preparation succeeded" \
+        "dataprep_upload_file" \
+        "test-comps-dataprep-qdrant-langchain-server"
+
+    # dataprep upload link
+    validate_services \
+        "${ip_address}:6362/v1/dataprep" \
+        "Data preparation succeeded" \
+        "dataprep_upload_link" \
+        "test-comps-dataprep-qdrant-langchain-server"
+
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=test-comps-dataprep-qdrant-langchain*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+
+    rm -f $LOG_PATH/dataprep_file.txt
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main