Fix Dataprep qdrant issues and add Test Script (opea-project#474)

Signed-off-by: letonghan <letong.han@intel.com>
wangkl2 · Aug 13, 2024 · a851abf · a851abf
1 parent 075e84f
commit a851abf
Show file tree

Hide file tree

Showing 7 changed files with 263 additions and 30 deletions.
diff --git a/comps/dataprep/qdrant/README.md b/comps/dataprep/qdrant/README.md
@@ -47,15 +47,15 @@ docker build -t opea/dataprep-qdrant:latest --build-arg https_proxy=$https_proxy
 ## Run Docker with CLI
 
 ```bash
-docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-qdrant:latest
+docker run -d --name="dataprep-qdrant-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-qdrant:latest
 ```
 
 ## Setup Environment Variables
 
 ```bash
 export http_proxy=${your_http_proxy}
 export https_proxy=${your_http_proxy}
-export QDRANT=${host_ip}
+export QDRANT_HOST=${host_ip}
 export QDRANT_PORT=6333
 export COLLECTION_NAME=${your_collection_name}
 ```
@@ -72,19 +72,32 @@ docker compose -f docker-compose-dataprep-qdrant.yaml up -d
 Once the document preparation microservice for Qdrant is started, you can use the command below to invoke the microservice, which converts the document to embeddings and saves them to the database.
 
 ```bash
-curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    http://localhost:6007/v1/dataprep
 ```
 
 You can specify chunk_size and chunk_overlap by the following commands.
 
 ```bash
-curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","chunk_size":1500,"chunk_overlap":100}' http://localhost:6000/v1/dataprep
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    -F "chunk_size=1500" \
+    -F "chunk_overlap=100" \
+    http://localhost:6007/v1/dataprep
 ```
 
 We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
 
 Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
 
 ```bash
-curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","process_table":true,"table_strategy":"hq"}' http://localhost:6000/v1/dataprep
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./your_file.pdf" \
+    -F "process_table=true" \
+    -F "table_strategy=hq" \
+    http://localhost:6007/v1/dataprep
 ```
diff --git a/comps/dataprep/qdrant/config.py b/comps/dataprep/qdrant/config.py
@@ -7,7 +7,7 @@
 EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 
 # Qdrant configuration
-QDRANT_HOST = os.getenv("QDRANT", "localhost")
+QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
 QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333))
 COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-qdrant")
 

diff --git a/comps/dataprep/qdrant/docker/Dockerfile b/comps/dataprep/qdrant/docker/Dockerfile
@@ -12,6 +12,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
     build-essential \
     libgl1-mesa-glx \
     libjemalloc-dev \
+    default-jre \
     vim
 
 RUN useradd -m -s /bin/bash user && \
@@ -22,13 +23,18 @@ USER user
 
 COPY comps /home/user/comps
 
-RUN pip install --no-cache-dir --upgrade pip && \
-    if [ ${ARCH} = "cpu" ]; then pip install torch --index-url https://download.pytorch.org/whl/cpu; fi && \
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
     pip install --no-cache-dir -r /home/user/comps/dataprep/qdrant/requirements.txt
 
 ENV PYTHONPATH=$PYTHONPATH:/home/user
 
+USER root
+
+RUN mkdir -p /home/user/comps/dataprep/qdrant/uploaded_files && chown -R user /home/user/comps/dataprep/qdrant/uploaded_files
+
+USER user
+
 WORKDIR /home/user/comps/dataprep/qdrant
 
 ENTRYPOINT ["python", "prepare_doc_qdrant.py"]
-
diff --git a/comps/dataprep/qdrant/docker/docker-compose-dataprep-qdrant.yaml b/comps/dataprep/qdrant/docker/docker-compose-dataprep-qdrant.yaml
@@ -9,19 +9,36 @@ services:
     ports:
       - "6333:6333"
       - "6334:6334"
+  tei-embedding-service:
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+    container_name: tei-embedding-server
+    ports:
+      - "6006:80"
+    volumes:
+      - "./data:/data"
+    shm_size: 1g
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
   dataprep-qdrant:
     image: opea/gen-ai-comps:dataprep-qdrant-xeon-server
     container_name: dataprep-qdrant-server
+    depends_on:
+      - qdrant-vector-db
+      - tei-embedding-service
     ports:
-      - "6000:6000"
+      - "6007:6007"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      QDRANT: ${QDRANT}
+      QDRANT_HOST: ${QDRANT_HOST}
       QDRANT_PORT: ${QDRANT_PORT}
       COLLECTION_NAME: ${COLLECTION_NAME}
+      TEI_ENDPOINT: ${TEI_ENDPOINT}
     restart: unless-stopped
 
 networks:

diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -1,30 +1,31 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import os
+import json
+from typing import List, Optional, Union
 
-from config import COLLECTION_NAME, EMBED_MODEL, QDRANT_HOST, QDRANT_PORT
+from config import COLLECTION_NAME, EMBED_MODEL, QDRANT_HOST, QDRANT_PORT, TEI_EMBEDDING_ENDPOINT
+from fastapi import File, Form, HTTPException, UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.vectorstores import Qdrant
+from langchain_huggingface import HuggingFaceEndpointEmbeddings
 from langchain_text_splitters import HTMLHeaderTextSplitter
 
-from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
-from comps.dataprep.utils import document_loader, get_separators, get_tables_result
+from comps import DocPath, opea_microservices, register_microservice
+from comps.dataprep.utils import (
+    document_loader,
+    encode_filename,
+    get_separators,
+    get_tables_result,
+    parse_html,
+    save_content_to_local_disk,
+)
 
-tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
+upload_folder = "./uploaded_files/"
 
 
-@register_microservice(
-    name="opea_service@prepare_doc_qdrant",
-    endpoint="/v1/dataprep",
-    host="0.0.0.0",
-    port=6000,
-    input_datatype=DocPath,
-    output_datatype=None,
-)
-@opea_telemetry
-def ingest_documents(doc_path: DocPath):
+def ingest_data_to_qdrant(doc_path: DocPath):
     """Ingest document to Qdrant."""
     path = doc_path.path
     print(f"Parsing document {path}.")
@@ -38,23 +39,30 @@ def ingest_documents(doc_path: DocPath):
         text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
     else:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
+            chunk_size=doc_path.chunk_size,
+            chunk_overlap=doc_path.chunk_overlap,
+            add_start_index=True,
+            separators=get_separators(),
         )
 
     content = document_loader(path)
+
     chunks = text_splitter.split_text(content)
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+
     # Create vectorstore
-    if tei_embedding_endpoint:
+    if TEI_EMBEDDING_ENDPOINT:
         # create embeddings using TEI endpoint service
-        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
+        embedder = HuggingFaceEndpointEmbeddings(model=TEI_EMBEDDING_ENDPOINT)
     else:
         # create embeddings using local embedding model
         embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
 
+    print("embedder created.")
+
     # Batch size
     batch_size = 32
     num_chunks = len(chunks)
@@ -71,6 +79,78 @@ def ingest_documents(doc_path: DocPath):
         )
         print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
 
+    return True
+
+
+@register_microservice(
+    name="opea_service@prepare_doc_qdrant",
+    endpoint="/v1/dataprep",
+    host="0.0.0.0",
+    port=6007,
+    input_datatype=DocPath,
+    output_datatype=None,
+)
+async def ingest_documents(
+    files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
+    link_list: Optional[str] = Form(None),
+    chunk_size: int = Form(1500),
+    chunk_overlap: int = Form(100),
+    process_table: bool = Form(False),
+    table_strategy: str = Form("fast"),
+):
+    """Save uploaded files or linked pages locally and ingest them into Qdrant.
+
+    Files take precedence; link_list (a JSON-encoded list) is read only when no
+    file is sent. Raises HTTPException 400 for missing or malformed input and
+    500 when processing a link fails.
+    """
+    print(f"files:{files}")
+    print(f"link_list:{link_list}")
+
+    # Splitting options shared by every document handled in this request.
+    options = {
+        "chunk_size": chunk_size,
+        "chunk_overlap": chunk_overlap,
+        "process_table": process_table,
+        "table_strategy": table_strategy,
+    }
+
+    if files:
+        if not isinstance(files, list):
+            files = [files]
+        for file in files:
+            save_path = upload_folder + encode_filename(file.filename)
+            await save_content_to_local_disk(save_path, file)
+            ingest_data_to_qdrant(DocPath(path=save_path, **options))
+            print(f"Successfully saved file {save_path}")
+
+        return {"status": 200, "message": "Data preparation succeeded"}
+
+    if link_list:
+        # Malformed JSON is a client error, so report it as 400 rather than crashing.
+        try:
+            link_list = json.loads(link_list)  # Parse JSON string to list
+        except json.JSONDecodeError as err:
+            raise HTTPException(status_code=400, detail="link_list should be a list.") from err
+        if not isinstance(link_list, list):
+            raise HTTPException(status_code=400, detail="link_list should be a list.")
+        for link in link_list:
+            save_path = upload_folder + encode_filename(link) + ".txt"
+            try:
+                content = parse_html([link])[0][0]
+                await save_content_to_local_disk(save_path, content)
+                ingest_data_to_qdrant(DocPath(path=save_path, **options))
+            except HTTPException:
+                raise  # already an HTTP error; pass it through unchanged
+            except Exception as err:
+                raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") from err
+
+            print(f"Successfully saved link {link}")
+
+        return {"status": 200, "message": "Data preparation succeeded"}
+
+    raise HTTPException(status_code=400, detail="Must provide either a file or a string list.")
+
 
 if __name__ == "__main__":
     opea_microservices["opea_service@prepare_doc_qdrant"].start()
diff --git a/comps/dataprep/qdrant/requirements.txt b/comps/dataprep/qdrant/requirements.txt
@@ -8,6 +8,7 @@ huggingface_hub
 langchain
 langchain-community
 langchain-text-splitters
+langchain_huggingface
 markdown
 numpy
 opentelemetry-api