Pinecone support for dataprep and retrieval microservice (#157)
Signed-off-by: Pallavi Jaini <pallavi.jaini@intel.com>
pallavijaini0525 committed Jun 26, 2024
1 parent 4649d68 commit 8b6486b
Showing 18 changed files with 438 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/docker/docker-compose.yaml
@@ -14,7 +14,7 @@ services:
    pull_policy: always
  retriever-redis-server:
    build:
-      dockerfile: comps/retrievers/langchain/docker/Dockerfile
+      dockerfile: comps/retrievers/langchain/redis/docker/Dockerfile
    extends: embedding-tei-server
    image: ${REGISTRY}/${REPO}:retriever-redis-server
  reranking-tei-server:
4 changes: 4 additions & 0 deletions comps/dataprep/README.md
@@ -22,6 +22,10 @@ For details, please refer to this [readme](milvus/README.md)

For details, please refer to this [readme](qdrant/README.md)

# Dataprep Microservice with Pinecone

For details, please refer to this [readme](pinecone/README.md)

# Dataprep Microservice with PGVector

For details, please refer to this [readme](pgvector/README.md)
69 changes: 69 additions & 0 deletions comps/dataprep/pinecone/README.md
@@ -0,0 +1,69 @@
# Dataprep Microservice with Pinecone

# 🚀Start Microservice with Python

## Install Requirements

```bash
pip install -r requirements.txt
```

## Start Pinecone Server

Please refer to this [readme](../../../vectorstores/langchain/pinecone/README.md).

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export PINECONE_API_KEY=${PINECONE_API_KEY}
export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME}
```

## Start Document Preparation Microservice for Pinecone with Python Script

Start the document preparation microservice for Pinecone with the command below.

```bash
python prepare_doc_pinecone.py
```

# 🚀Start Microservice with Docker

## Build Docker Image

```bash
cd ../../../../
docker build -t opea/dataprep-pinecone:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/pinecone/docker/Dockerfile .
```

## Run Docker with CLI

```bash
docker run -d --name="dataprep-pinecone-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-pinecone:latest
```

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export PINECONE_API_KEY=${PINECONE_API_KEY}
export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME}
```

## Run Docker with Docker Compose

```bash
cd comps/dataprep/pinecone/docker
docker compose -f docker-compose-dataprep-pinecone.yaml up -d
```

# Invoke Microservice

Once the document preparation microservice for Pinecone is started, use the command below to invoke it. The microservice converts the document into embeddings and saves them to the database.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
```
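
The same endpoint can also be exercised from Python; the sketch below is a minimal illustration using `requests`, with the document path as a placeholder:

```python
import requests

# Placeholder path: the file must be readable from wherever
# the dataprep service runs (host or container).
payload = {"path": "/path/to/document"}

resp = requests.post("http://localhost:6000/v1/dataprep", json=payload, timeout=600)
resp.raise_for_status()
print("Ingestion finished with status", resp.status_code)
```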
2 changes: 2 additions & 0 deletions comps/dataprep/pinecone/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
16 changes: 16 additions & 0 deletions comps/dataprep/pinecone/config.py
@@ -0,0 +1,16 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
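
The LangChain Pinecone vectorstore writes into an existing index, so it helps to verify the configured index before starting the service. A minimal pre-flight check, assuming the v3+ `pinecone-client` API:

```python
import os

from pinecone import Pinecone  # assumes pinecone-client v3+

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# Fail early if the index has not been created yet.
if index_name not in pc.list_indexes().names():
    raise SystemExit(f"Pinecone index '{index_name}' not found; create it first.")
print(pc.describe_index(index_name))
```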
31 changes: 31 additions & 0 deletions comps/dataprep/pinecone/docker/Dockerfile
@@ -0,0 +1,31 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ENV LANG C.UTF-8

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/pinecone/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/dataprep/pinecone

ENTRYPOINT ["python", "prepare_doc_pinecone.py"]

21 changes: 21 additions & 0 deletions comps/dataprep/pinecone/docker/docker-compose-dataprep-pinecone.yaml
@@ -0,0 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
  dataprep-pinecone:
    image: opea/gen-ai-comps:dataprep-pinecone-xeon-server
    container_name: dataprep-pinecone-server
    ports:
      - "6000:6000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      PINECONE_API_KEY: ${PINECONE_API_KEY}
      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
    restart: unless-stopped

networks:
  default:
    driver: bridge
60 changes: 60 additions & 0 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -0,0 +1,60 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from config import EMBED_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
from langchain_community.vectorstores import Pinecone

from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
from comps.dataprep.utils import document_loader

tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")


@register_microservice(
    name="opea_service@prepare_doc_pinecone",
    endpoint="/v1/dataprep",
    host="0.0.0.0",
    port=6000,
    input_datatype=DocPath,
    output_datatype=None,
)
@opea_telemetry
def ingest_documents(doc_path: DocPath):
    """Ingest a document into Pinecone."""
    doc_path = doc_path.path
    print(f"Parsing document {doc_path}.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    content = document_loader(doc_path)
    chunks = text_splitter.split_text(content)

    print(f"Done preprocessing. Created {len(chunks)} chunks of the original document.")

    # Create embedder: use the TEI endpoint service if configured, else a local model
    if tei_embedding_endpoint:
        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
    else:
        embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)

    # Upsert chunks into the Pinecone index in batches to keep requests small
    batch_size = 32
    num_chunks = len(chunks)
    for i in range(0, num_chunks, batch_size):
        batch_texts = chunks[i : i + batch_size]

        _ = Pinecone.from_texts(
            texts=batch_texts,
            embedding=embedder,
            index_name=PINECONE_INDEX_NAME,
        )
        print(f"Processed batch {i // batch_size + 1}/{(num_chunks - 1) // batch_size + 1}")


if __name__ == "__main__":
    opea_microservices["opea_service@prepare_doc_pinecone"].start()
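
After ingestion, the index can be spot-checked with a similarity search through the same LangChain vectorstore. A minimal sketch (the query string is illustrative, and `PINECONE_API_KEY` must be exported):

```python
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Pinecone

from config import EMBED_MODEL, PINECONE_INDEX_NAME

# Query embeddings must come from the same model used at ingestion time.
embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
vectorstore = Pinecone.from_existing_index(index_name=PINECONE_INDEX_NAME, embedding=embedder)

docs = vectorstore.similarity_search("What is this document about?", k=3)  # illustrative query
for doc in docs:
    print(doc.page_content[:200])
```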
20 changes: 20 additions & 0 deletions comps/dataprep/pinecone/requirements.txt
@@ -0,0 +1,20 @@
beautifulsoup4
docarray[full]
easyocr
fastapi
huggingface_hub
langchain
langchain-community
langchain-pinecone
langsmith
numpy
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
pandas
Pillow
pinecone-client
pymupdf
python-docx
sentence_transformers
shortuuid
2 changes: 2 additions & 0 deletions comps/retrievers/langchain/pinecone/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
16 changes: 16 additions & 0 deletions comps/retrievers/langchain/pinecone/config.py
@@ -0,0 +1,16 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
29 changes: 29 additions & 0 deletions comps/retrievers/langchain/pinecone/docker/Dockerfile
@@ -0,0 +1,29 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM langchain/langchain:latest

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

COPY comps /home/user/comps

RUN chmod +x /home/user/comps/retrievers/langchain/pinecone/run.sh

USER user

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/retrievers/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/retrievers/langchain/pinecone

ENTRYPOINT ["/home/user/comps/retrievers/langchain/pinecone/run.sh"]
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tei_xeon_service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
    container_name: tei-xeon-server
    ports:
      - "6060:80"
    volumes:
      - "./data:/data"
    shm_size: 1g
    command: --model-id ${RETRIEVE_MODEL_ID}
  retriever:
    image: opea/retriever-pinecone:latest
    container_name: retriever-pinecone-server
    ports:
      - "7000:7000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      PINECONE_API_KEY: ${PINECONE_API_KEY}
      INDEX_NAME: ${PINECONE_INDEX_NAME}
      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
    restart: unless-stopped

networks:
  default:
    driver: bridge
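
Once both services are up, the retriever can be smoke-tested end to end. The sketch below assumes the Pinecone retriever exposes the same `/v1/retrieval` contract as the repo's other langchain retrievers (query text plus a query embedding); the dummy vector's dimension must match the embedding model in use:

```python
import requests

# 384 matches sentence-transformers/all-MiniLM-L6-v2; adjust to your
# RETRIEVE_MODEL_ID's output dimension. A zero vector suffices for a smoke test.
payload = {
    "text": "What did the author work on?",  # illustrative query
    "embedding": [0.0] * 384,
}

resp = requests.post("http://localhost:7000/v1/retrieval", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())
```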