From 20e906ac8227da785692731b3d8ef46d757ffe0c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 22 Oct 2024 21:30:29 -0400 Subject: [PATCH 01/22] arangodb prep | initial commit --- .gitignore | 1 + ARANGODB_README.md | 33 +++++++++++++++ langchain_test.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 ARANGODB_README.md create mode 100644 langchain_test.py diff --git a/.gitignore b/.gitignore index 1d1e0a389..9778bf8f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ *.egg-info/ .DS_Store +.venv \ No newline at end of file diff --git a/ARANGODB_README.md b/ARANGODB_README.md new file mode 100644 index 000000000..76f4b7db3 --- /dev/null +++ b/ARANGODB_README.md @@ -0,0 +1,33 @@ +Instructions + +0. Create a virtual environment: + +```bash +python -m venv .venv + +source .venv/bin/activate +``` + +1. Install the required packages: + +```bash +pip install python-arango +pip install langchain_openai +pip install git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +``` + +2. Provision the ArangoDB with Vector Index image: + +```bash +docker create --name arango-vector -p 8529:8529 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview + +docker start arango-vector +``` + +3. Set your `OPENAI_API_KEY` environment variable (contact Anthony for access) + +4. Run the test script to confirm LangChain is working: + +```bash +python langchain_test.py +``` \ No newline at end of file diff --git a/langchain_test.py b/langchain_test.py new file mode 100644 index 000000000..e33ea1687 --- /dev/null +++ b/langchain_test.py @@ -0,0 +1,101 @@ +from arango import ArangoClient +from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_core.documents import Document +from langchain_openai import OpenAI + +system_db = ArangoClient().db("_system", password="test", verify=True) +system_db.delete_database("langchain_test", ignore_missing=True) +system_db.create_database("langchain_test") +db = ArangoClient().db("langchain_test", password="test", verify=True) + +#################### +# Test ArangoGraph # +#################### + +# Create nodes +node1 = Node(id="1", type="Person", properties={"name": "John", "age": 30}) +node2 = Node(id="2", type="Person", properties={"name": "Jane", "age": 28}) +node3 = Node(id="3", type="Club", properties={"name": "Karate Club"}) + +# Create relationships +relationship1 = Relationship(source=node1, target=node3, type="MEMBER_OF", properties={"joined_date": "2020-01-01"}) +relationship2 = Relationship(source=node2, target=node3, type="MEMBER_OF", properties={"joined_date": "2019-05-15"}) +relationship3 = Relationship(source=node1, target=node2, type="KNOWS", properties={"since": "2018-03-10"}) + +# Create source document +source_doc = Document( + page_content="John and Jane are members of the Karate Club. 
They know each other.", + metadata={"source": "club_records"}, +) + +# Create GraphDocument +graph_doc = GraphDocument( + nodes=[node1, node2, node3], relationships=[relationship1, relationship2, relationship3], source=source_doc +) + +arango_graph = ArangoGraph(db=db, include_examples=False) +arango_graph.add_graph_documents([graph_doc], graph_name="NewGraph", include_source=True) + +##################### +# Test ArangoVector # +##################### + +# Add some sample texts +texts = [ + "The quick brown fox jumps over the lazy dog", + "A journey of a thousand miles begins with a single step", + "To be or not to be, that is the question", + "All that glitters is not gold", + "hello what's up", +] + +vector_store = ArangoVector.from_texts( + texts, + OpenAIEmbeddings(), + database=db, + collection_name="vector_test", + index_name="vector_index", + distance_strategy="COSINE", +) + +texts_2 = ["the dog, cat, and mouse are all mammals"] +vector_store.add_texts(texts_2) + +# Perform a similarity search +query = "What animal is mentioned?" +results = vector_store.similarity_search_with_score(query, k=2) + +print("Search results for query:", query) +for doc, score in results: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +# Try another query +query2 = "What's a famous Shakespeare quote?" +results2 = vector_store.similarity_search_with_score(query2, k=1) + +print("\nSearch results for query:", query2) +for doc, score in results2: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +########################### +# Test ArangoGraphQAChain # +########################### + +llm = OpenAI(temperature=0) +graph = ArangoGraph(db=db, include_examples=False, graph_name="NewGraph") +chain = ArangoGraphQAChain.from_llm(llm, graph=graph, allow_dangerous_requests=True) +chain.verbose = True +chain.execute_aql_query = False +chain.run("What is the name of the club?") +chain.execute_aql_query = True +chain.run("What is the name of the club?") From 388d802b42cec611b1fe51efb697e62d8d005af4 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:28:02 -0800 Subject: [PATCH 02/22] ArangoDB: Feedback management (#11) * initial commit * updating feedback management readme to match arango * Removing comments above import * Working API test and updated readme * Working docker compose file * Docker compose creating network and docker image * code review * update readme & dev yaml * delete dev files * Delete arango_store.py --------- Co-authored-by: Anthony Mahanna --- comps/feedback_management/arango/Dockerfile | 30 +++ comps/feedback_management/arango/README.md | 172 ++++++++++++++++ .../feedback_management/arango/arango_conn.py | 32 +++ .../arango/arango_store.py | 186 ++++++++++++++++++ comps/feedback_management/arango/config.py | 13 ++ .../docker-compose-user-feedback-arango.yaml | 38 ++++ comps/feedback_management/arango/feedback.py | 172 ++++++++++++++++ .../arango/requirements.txt | 1 + 8 files changed, 644 insertions(+) create mode 100644 comps/feedback_management/arango/Dockerfile create mode 100644 comps/feedback_management/arango/README.md create mode 100644 comps/feedback_management/arango/arango_conn.py create mode 100644 comps/feedback_management/arango/arango_store.py create mode 100644 comps/feedback_management/arango/config.py create mode 100644 
comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml create mode 100644 comps/feedback_management/arango/feedback.py create mode 100644 comps/feedback_management/arango/requirements.txt diff --git a/comps/feedback_management/arango/Dockerfile b/comps/feedback_management/arango/Dockerfile new file mode 100644 index 000000000..95ac359e6 --- /dev/null +++ b/comps/feedback_management/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/feedback_management/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/feedback_management/arango + +ENTRYPOINT ["python", "feedback.py"] diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md new file mode 100644 index 000000000..8eb223ce9 --- /dev/null +++ b/comps/feedback_management/arango/README.md @@ -0,0 +1,172 @@ +# πŸ—¨ Feedback Management Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Feedback Management microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +export PROTOCOL=${PROTOCOL} +export PYTHONPATH={Path to base of directory} +``` + +--- + +## πŸš€Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . 
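+# Note: the build is run from the repository root (~/GenAIComps) because the
+# Dockerfile copies the top-level `comps` directory and `requirements.txt`
+# into the image.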
+``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Feedback Management microservice + + ```bash + docker run -d -p 6016:6016 \ + --name="feedbackmanagement-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e PROTOCOL=${PROTOCOL} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:latest + + ``` + +--- + +### βœ… Invoke Microservice + +The Feedback Management microservice exposes the following API endpoints: + +- Save feedback data + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + }}' + + + # Take note that chat_id here would be the id get from feedback_arango service + # If you do not wish to maintain chat history via feedback_arango service, you may generate some random uuid for it or just leave it empty. + ``` + +- Update feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Fair and Moderate answer", + "rating": 2, + "is_thumbs_up": true + }, + "feedback_id": "{feedback_id of the data that wanted to update}"}' + + # Just include any feedback_data field value that you wanted to update. 
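+  # On success, the create endpoint returns the new feedback_id (the ArangoDB
+  # document `_key`) when creating a record, or `true` when updating an
+  # existing one.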
+ ``` + +- Retrieve feedback data by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id returned from save feedback route above}"}' + ``` + +- Delete feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id to be deleted}"}' + ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py new file mode 100644 index 000000000..f9ac9e411 --- /dev/null +++ b/comps/feedback_management/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL + + +class ArangoClient: + conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py new file mode 100644 index 000000000..cd22b8078 --- /dev/null +++ b/comps/feedback_management/arango/arango_store.py @@ -0,0 +1,186 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class FeedbackStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_feedback(self, feedback_data: BaseModel) -> str: + """Stores a new feedback data into the storage. + + Args: + feedback_data (object): The document to be stored. + + Returns: + str: The ID of the inserted feedback data. + + Raises: + Exception: If an error occurs while storing the feedback_data. + """ + try: + model_dump = feedback_data.model_dump(by_alias=True, mode="json", exclude={"feedback_id"}) + + inserted_feedback_data = self.collection.insert(model_dump) + + feedback_id = str(inserted_feedback_data["_key"]) + + return feedback_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_feedback(self, feedback_data: BaseModel) -> bool: + """Update a feedback data in the collection with given id. 
+ + Args: + feedback_id (str): The ID of the data to be updated. + updated_data (object): The data to be updated in the entry. + + Returns: + bool: True if the data updated successfully, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the feedback data. + """ + _key = feedback_data.feedback_id + document = self.collection.get(_key) + + if document is None: + raise KeyError(f"Document with ID: {_key} not found.") + + if document["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {_key} does not belong to user: {self.user}") + + try: + model_dump = feedback_data.feedback_data.model_dump(by_alias=True, mode="json") + + self.collection.update( + {"_key": _key, "feedback_data": model_dump}, + merge=True, + keep_none=False, + ) + + print(f"Updated document: {_key} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_feedback_of_user(self) -> list[dict]: + """Retrieves all feedback data of a user from the collection. + + Returns: + list[dict] | None: List of dict of feedback data of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + feedback_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `feedback_data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "feedback_data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["feedback_id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + feedback_data_list.append(document) + + return feedback_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_feedback_by_id(self, feedback_id: str) -> dict | None: + """Retrieves a user feedback data from the collection based on the given feedback ID. + + Args: + feedback_id (str): The ID of the feedback data to retrieve. + + Returns: + dict | None: The user's feedback data if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_feedback(self, feedback_id: str) -> bool: + """Delete a feedback data from collection by given feedback_id. + + Args: + feedback_id(str): The ID of the feedback data to be deleted. + + Returns: + bool: True if feedback is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(feedback_id) + print(f"Deleted document: {feedback_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py new file mode 100644 index 000000000..e3272febf --- /dev/null +++ b/comps/feedback_management/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +PROTOCOL = os.getenv("PROTOCOL", "http") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml new file mode 100644 index 000000000..f4be0c845 --- /dev/null +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arangodb: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + feedbackmanagement-arango: + image: opea/feedbackmanagement-arango:latest + container_name: feedbackmanagement-arango-server + ports: + - "6016:6016" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + PROTOCOL: ${PROTOCOL} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + feedback_network: + driver: bridge diff --git a/comps/feedback_management/arango/feedback.py b/comps/feedback_management/arango/feedback.py new file mode 100644 index 000000000..f1efa6f43 --- /dev/null +++ b/comps/feedback_management/arango/feedback.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Annotated, Optional + +from arango_store import FeedbackStore +from fastapi import HTTPException +from pydantic import BaseModel, Field + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("feedback_arango") +logflag = os.getenv("LOGFLAG", False) + + +class FeedbackData(BaseModel): + """This class represents the data model of FeedbackData collected to store in database.". + + Attributes: + is_thumbs_up (bool): True if the response is satisfy, False otherwise. + rating: (int)[Optional]: Score rating. Range from 0 (bad rating) to 5(good rating). + comment (str)[Optional]: Comment given for response. 
+ """ + + is_thumbs_up: bool + rating: Annotated[Optional[int], Field(ge=0, le=5)] = None + comment: Optional[str] = None + + +class ChatFeedback(BaseModel): + """This class represents the model for chat to collect FeedbackData together with ChatCompletionRequest data to store in database. + + Attributes: + chat_data (ChatCompletionRequest): ChatCompletionRequest object containing chat data to be stored. + feedback_data (FeedbackData): FeedbackData object containing feedback data for chat to be stored. + chat_id (str)[Optional]: The chat_id associated to the chat to be store together with feedback data. + feedback_id (str)[Optional]: The feedback_id of feedback data to be retrieved from database. + """ + + chat_data: ChatCompletionRequest + feedback_data: FeedbackData + chat_id: Optional[str] = None + feedback_id: Optional[str] = None + + +class FeedbackId(BaseModel): + """This class represent the data model for retrieve feedback data stored in database. + + Attributes: + user (str): The user of the requested feedback data. + feedback_id (str): The feedback_id of feedback data to be retrieved from database. + """ + + user: str + feedback_id: Optional[str] = None + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/create", + host="0.0.0.0", + input_datatype=FeedbackData, + port=6016, +) +async def create_feedback_data(feedback: ChatFeedback): + """Creates and stores a feedback data in database. + + Args: + feedback (ChatFeedback): The ChatFeedback class object containing feedback data to be stored. + + Returns: + response (str/bool): FeedbackId of the object created in database. True if data update successfully. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.chat_data.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + response = feedback_store.save_feedback(feedback) + else: + response = feedback_store.update_feedback(feedback) + + if logflag: + logger.info(response) + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/get", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def get_feedback(feedback: FeedbackId): + """Retrieves feedback_data from feedback store based on provided FeedbackId or user. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id. + + Returns: + JSON: Retrieved feedback data if successful, error otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id: + response = feedback_store.get_feedback_by_id(feedback.feedback_id) + else: + response = feedback_store.get_all_feedback_of_user() + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/delete", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def delete_feedback(feedback: FeedbackId): + """Delete a feedback data from feedback store by given feedback Id. 
+ + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id + + Returns: + Result of deletion if successful, None otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + raise Exception("feedback_id is required.") + else: + response = feedback_store.delete_feedback(feedback.feedback_id) + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@feedback_arango"].start() diff --git a/comps/feedback_management/arango/requirements.txt b/comps/feedback_management/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/feedback_management/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From 6973d914c8932dd633914ccff24beacfa3c075c6 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 25 Nov 2024 19:05:21 -0500 Subject: [PATCH 03/22] remove: `PROTOCOL` env --- comps/feedback_management/arango/README.md | 2 -- comps/feedback_management/arango/arango_conn.py | 4 ++-- comps/feedback_management/arango/config.py | 3 +-- .../arango/docker-compose-user-feedback-arango.yaml | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 8eb223ce9..e0f070b68 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -13,7 +13,6 @@ export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} export COLLECTION_NAME=${COLLECTION_NAME} -export PROTOCOL=${PROTOCOL} export PYTHONPATH={Path to base of directory} ``` @@ -49,7 +48,6 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ - -e PROTOCOL=${PROTOCOL} \ -e COLLECTION_NAME=${COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index f9ac9e411..84ded0428 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index e3272febf..c332de7e5 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -9,5 +9,4 @@ ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") -PROTOCOL = os.getenv("PROTOCOL", "http") +COLLECTION_NAME = 
os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index f4be0c845..62ab0df54 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -28,11 +28,10 @@ services: ARANGO_PORT: ${ARANGO_PORT} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - PROTOCOL: ${PROTOCOL} DB_NAME: ${DB_NAME} COLLECTION_NAME: ${COLLECTION_NAME} restart: unless-stopped networks: - feedback_network: + default: driver: bridge From 5e9742c52b6ab2b5f6f40b3ce2b171835b824fc1 Mon Sep 17 00:00:00 2001 From: SLasyaN Date: Tue, 26 Nov 2024 13:47:41 -0800 Subject: [PATCH 04/22] ArangoDB: PromptRegistry (#8) * Initial commit * remove unnecessary files * code review * update: `prompt_search` * new: `ARANGO_PROTOCOL` * README * cleanup --------- Co-authored-by: lasyasn Co-authored-by: Anthony Mahanna --- comps/feedback_management/README.md | 4 + comps/feedback_management/arango/README.md | 4 + .../feedback_management/arango/arango_conn.py | 4 +- comps/feedback_management/arango/config.py | 3 +- .../docker-compose-user-feedback-arango.yaml | 3 +- comps/prompt_registry/README.md | 4 + comps/prompt_registry/arango/DockerFile | 30 +++ comps/prompt_registry/arango/README.md | 120 ++++++++++ comps/prompt_registry/arango/arango_conn.py | 32 +++ comps/prompt_registry/arango/arango_store.py | 213 ++++++++++++++++++ comps/prompt_registry/arango/config.py | 13 ++ ...docker-compose-prompt-registry-arango.yaml | 38 ++++ comps/prompt_registry/arango/prompt.py | 148 ++++++++++++ comps/prompt_registry/arango/requirements.txt | 1 + 14 files changed, 613 insertions(+), 4 deletions(-) create mode 100644 comps/prompt_registry/arango/DockerFile create mode 100644 comps/prompt_registry/arango/README.md create mode 100644 comps/prompt_registry/arango/arango_conn.py create mode 100644 comps/prompt_registry/arango/arango_store.py create mode 100644 comps/prompt_registry/arango/config.py create mode 100644 comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml create mode 100644 comps/prompt_registry/arango/prompt.py create mode 100644 comps/prompt_registry/arango/requirements.txt diff --git a/comps/feedback_management/README.md b/comps/feedback_management/README.md index 2e68aa413..9cd4b42a5 100644 --- a/comps/feedback_management/README.md +++ b/comps/feedback_management/README.md @@ -20,3 +20,7 @@ The Feedback Management microservice able to support various database backends f ### Feedback Management with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Feedback Management with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index e0f070b68..7e9a5f840 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -6,9 +6,12 @@ This README provides setup guides and all the necessary information about the Fe ## Setup Environment Variables +See `config.py` for default values. 
+ ```bash export ARANGO_HOST=${ARANGO_HOST} export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} @@ -45,6 +48,7 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e no_proxy=$no_proxy \ -e ARANGO_HOST=${ARANGO_HOST} \ -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index 84ded0428..d6c4b5977 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index c332de7e5..bb790eb38 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -6,7 +6,8 @@ # ARANGO configuration ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 62ab0df54..8f9b3a85a 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -3,7 +3,7 @@ version: "3" services: - arangodb: + arango: image: arangodb/arangodb:latest container_name: arangodb ports: @@ -26,6 +26,7 @@ services: no_proxy: ${no_proxy} ARANGO_HOST: ${ARANGO_HOST} ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} DB_NAME: ${DB_NAME} diff --git a/comps/prompt_registry/README.md b/comps/prompt_registry/README.md index 6332a1a13..a99b1b27b 100644 --- a/comps/prompt_registry/README.md +++ b/comps/prompt_registry/README.md @@ -19,3 +19,7 @@ The Prompt Registry microservice able to support various database backends for s ### Prompt Registry with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Prompt Registry with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/prompt_registry/arango/DockerFile b/comps/prompt_registry/arango/DockerFile new file mode 100644 index 000000000..065920205 --- /dev/null +++ b/comps/prompt_registry/arango/DockerFile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: 
Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/prompt_registry/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/prompt_registry/arango + +ENTRYPOINT ["python", "prompt.py"] diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md new file mode 100644 index 000000000..e4bdd6c10 --- /dev/null +++ b/comps/prompt_registry/arango/README.md @@ -0,0 +1,120 @@ +# 🧾 Prompt Registry Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Prompt Registry microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## πŸš€Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . +``` + +### Run Docker with CLI + + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Prompt Registry microservice + + ```bash + docker run -d -p 6018:6018 \ + --name="promptregistry-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:latest + + ``` + +--- + +### βœ… Invoke Microservice + +The Prompt Registry microservice exposes the following API endpoints: + +- Save prompt + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" + }' + ``` + +- Retrieve prompt from database by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve prompt from database by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{_id returned from save prompt route above}"}' + ``` + +- Retrieve relevant prompt by keyword + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_text": "{keyword to search}"}' + ``` + 
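+  Keyword search is backed by an ArangoDB inverted index on `prompt_text`
+  (created on first use as `prompt_text_index`) and filters with the AQL
+  `PHRASE()` function and the `text_en` analyzer, so matching is token-based
+  rather than a plain substring search.
+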
+- Delete prompt by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{prompt_id to be deleted}"}' + ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/prompt_registry/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py new file mode 100644 index 000000000..fb80ccd20 --- /dev/null +++ b/comps/prompt_registry/arango/arango_store.py @@ -0,0 +1,213 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from arango.exceptions import IndexGetError +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from prompt import PromptCreate +from pydantic import BaseModel + +from comps import CustomLogger + +logger = CustomLogger("arango_store") +logflag = os.getenv("LOGFLAG", False) + + +class PromptStore: + + def __init__( + self, + user: str, + ): + self.user = user + self.inverted_index_exists = False + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_prompt(self, prompt: PromptCreate): + """Stores a new prompt into the storage. + + Args: + prompt: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted prompt. + + Raises: + Exception: If an error occurs while storing the prompt. + """ + try: + model_dump = prompt.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_prompt_data = self.collection.insert(model_dump) + + prompt_id = str(inserted_prompt_data["_key"]) + + return prompt_id + + except Exception as e: + print(e) + raise Exception(e) + + def get_all_prompt_of_user(self) -> list[dict]: + """Retrieves all prompts of a user from the collection. + + Returns: + list[dict] | None: List of dict of prompts of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + prompt_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Prompt Registry as a reference. 
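+            # NOTE: unlike the feedback store, prompts are saved with `user` at the
+            # top level of the document (there is no `chat_data` wrapper), so the
+            # filter below most likely needs to be `doc.user == @user`.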
+ cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + prompt_data_list.append(document) + + return prompt_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_prompt_by_id(self, prompt_id: str) -> dict | None: + """Retrieves a user prompt from the collection based on the given prompt ID. + + Args: + prompt_id (str): The ID of the prompt to retrieve. + + Returns: + dict | None: The user prompt if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Prompt with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. Prompt with ID: {prompt_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def prompt_search(self, keyword: str) -> list | None: + """Retrieves prompt from the collection based on keyword provided. + + Args: + keyword (str): The keyword of prompt to search for. + + Returns: + list | None: The list of relevant prompt if found, None otherwise. + + Raises: + Exception: If there is an error while searching data. + """ + try: + index_name = "prompt_text_index" + + if not self.inverted_index_exists: + try: + self.collection.get_index(index_name) + + except IndexGetError: + self.collection.add_inverted_index( + fields=["prompt_text"], + name=index_name, + # TODO: add more kwargs if needed + ) + + self.inverted_index_exists = True + + query = """ + FOR doc IN @@collection + OPTIONS { indexHint: @index_name, forceIndexHint: true } + FILTER PHRASE(doc.prompt_text, @keyword, "text_en") + RETURN doc + """ + + cursor = self.db_client.aql.execute( + query, + bind_vars={ + "@collection": self.collection.name, + "index_name": index_name, + "keyword": keyword, + }, + ) + + serialized_data = [] + for doc in cursor: + doc["id"] = str(doc["_key"]) + del doc["_id"] + del doc["_key"] + del doc["_rev"] + + serialized_data.append(doc) + + return serialized_data + + except Exception as e: + print(e) + raise Exception(e) + + def delete_prompt(self, prompt_id: str) -> bool: + """Delete a prompt from collection by given prompt_id. + + Args: + prompt_id(str): The ID of the prompt to be deleted. + + Returns: + bool: True if prompt is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Feedback with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {prompt_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(prompt_id) + print(f"Deleted document: {prompt_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py new file mode 100644 index 000000000..9719f1358 --- /dev/null +++ b/comps/prompt_registry/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGODB_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml new file mode 100644 index 000000000..b1aee077d --- /dev/null +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + promptregistry-arango: + image: opea/promptregistry-arango:latest + container_name: promptregistry-arango-server + ports: + - "6018:6018" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/prompt_registry/arango/prompt.py b/comps/prompt_registry/arango/prompt.py new file mode 100644 index 000000000..c46e0174c --- /dev/null +++ b/comps/prompt_registry/arango/prompt.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Optional + +from arango_store import PromptStore +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice + +logger = CustomLogger("prompt_arango") +logflag = os.getenv("LOGFLAG", False) + + +class PromptCreate(BaseModel): + """This class represents the data model for creating and storing a new prompt in the database. + + Attributes: + prompt_text (str): The text content of the prompt. + user (str): The user or creator of the prompt. + """ + + prompt_text: str + user: str + + +class PromptId(BaseModel): + """This class represent the data model for retrieve prompt stored in database. + + Attributes: + user (str): The user of the requested prompt. + prompt_id (str): The prompt_id of prompt to be retrieved from database. 
+ """ + + user: str + prompt_id: Optional[str] = None + prompt_text: Optional[str] = None + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/create", + host="0.0.0.0", + input_datatype=PromptCreate, + port=6018, +) +async def create_prompt(prompt: PromptCreate): + """Creates and stores a prompt in prompt store. + + Args: + prompt (PromptCreate): The PromptCreate class object containing the data to be stored. + + Returns: + JSON (PromptResponse): PromptResponse class object, None otherwise. + """ + if logflag: + logger.info(prompt) + + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + response = prompt_store.save_prompt(prompt) + if logflag: + logger.info(response) + + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/get", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def get_prompt(prompt: PromptId): + """Retrieves prompt from prompt store based on provided PromptId or user. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + JSON: Retrieved prompt data if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + + if prompt.prompt_id is not None: + response = prompt_store.get_user_prompt_by_id(prompt.prompt_id) + elif prompt.prompt_text: + response = prompt_store.prompt_search(prompt.prompt_text) + else: + response = prompt_store.get_all_prompt_of_user() + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/delete", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def delete_prompt(prompt: PromptId): + """Delete a prompt from prompt store by given PromptId. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + if prompt.prompt_id is None: + raise Exception("Prompt id is required.") + else: + response = prompt_store.delete_prompt(prompt.prompt_id) + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +if __name__ == "__main__": + opea_microservices["opea_service@prompt_arango"].start() diff --git a/comps/prompt_registry/arango/requirements.txt b/comps/prompt_registry/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/prompt_registry/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From 0e9ed3ba98be04d106ead44c692fc215944997f5 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 22 Oct 2024 21:30:29 -0400 Subject: [PATCH 05/22] arangodb prep | initial commit --- .gitignore | 1 + ARANGODB_README.md | 33 +++++++++++++++ langchain_test.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 ARANGODB_README.md create mode 100644 langchain_test.py diff --git a/.gitignore b/.gitignore index 1d1e0a389..9778bf8f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ *.egg-info/ .DS_Store +.venv \ No newline at end of file diff --git a/ARANGODB_README.md b/ARANGODB_README.md new file mode 100644 index 000000000..76f4b7db3 --- /dev/null +++ b/ARANGODB_README.md @@ -0,0 +1,33 @@ +Instructions + +0. Create a virtual environment: + +```bash +python -m venv .venv + +source .venv/bin/activate +``` + +1. Install the required packages: + +```bash +pip install python-arango +pip install langchain_openai +pip install git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +``` + +2. Provision the ArangoDB with Vector Index image: + +```bash +docker create --name arango-vector -p 8529:8529 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview + +docker start arango-vector +``` + +3. Set your `OPENAI_API_KEY` environment variable (contact Anthony for access) + +4. 
Run the test script to confirm LangChain is working: + +```bash +python langchain_test.py +``` \ No newline at end of file diff --git a/langchain_test.py b/langchain_test.py new file mode 100644 index 000000000..e33ea1687 --- /dev/null +++ b/langchain_test.py @@ -0,0 +1,101 @@ +from arango import ArangoClient +from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_core.documents import Document +from langchain_openai import OpenAI + +system_db = ArangoClient().db("_system", password="test", verify=True) +system_db.delete_database("langchain_test", ignore_missing=True) +system_db.create_database("langchain_test") +db = ArangoClient().db("langchain_test", password="test", verify=True) + +#################### +# Test ArangoGraph # +#################### + +# Create nodes +node1 = Node(id="1", type="Person", properties={"name": "John", "age": 30}) +node2 = Node(id="2", type="Person", properties={"name": "Jane", "age": 28}) +node3 = Node(id="3", type="Club", properties={"name": "Karate Club"}) + +# Create relationships +relationship1 = Relationship(source=node1, target=node3, type="MEMBER_OF", properties={"joined_date": "2020-01-01"}) +relationship2 = Relationship(source=node2, target=node3, type="MEMBER_OF", properties={"joined_date": "2019-05-15"}) +relationship3 = Relationship(source=node1, target=node2, type="KNOWS", properties={"since": "2018-03-10"}) + +# Create source document +source_doc = Document( + page_content="John and Jane are members of the Karate Club. They know each other.", + metadata={"source": "club_records"}, +) + +# Create GraphDocument +graph_doc = GraphDocument( + nodes=[node1, node2, node3], relationships=[relationship1, relationship2, relationship3], source=source_doc +) + +arango_graph = ArangoGraph(db=db, include_examples=False) +arango_graph.add_graph_documents([graph_doc], graph_name="NewGraph", include_source=True) + +##################### +# Test ArangoVector # +##################### + +# Add some sample texts +texts = [ + "The quick brown fox jumps over the lazy dog", + "A journey of a thousand miles begins with a single step", + "To be or not to be, that is the question", + "All that glitters is not gold", + "hello what's up", +] + +vector_store = ArangoVector.from_texts( + texts, + OpenAIEmbeddings(), + database=db, + collection_name="vector_test", + index_name="vector_index", + distance_strategy="COSINE", +) + +texts_2 = ["the dog, cat, and mouse are all mammals"] +vector_store.add_texts(texts_2) + +# Perform a similarity search +query = "What animal is mentioned?" +results = vector_store.similarity_search_with_score(query, k=2) + +print("Search results for query:", query) +for doc, score in results: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +# Try another query +query2 = "What's a famous Shakespeare quote?" 
+results2 = vector_store.similarity_search_with_score(query2, k=1) + +print("\nSearch results for query:", query2) +for doc, score in results2: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +########################### +# Test ArangoGraphQAChain # +########################### + +llm = OpenAI(temperature=0) +graph = ArangoGraph(db=db, include_examples=False, graph_name="NewGraph") +chain = ArangoGraphQAChain.from_llm(llm, graph=graph, allow_dangerous_requests=True) +chain.verbose = True +chain.execute_aql_query = False +chain.run("What is the name of the club?") +chain.execute_aql_query = True +chain.run("What is the name of the club?") From 7bd5aaacdb43a90f6b8e2cb8ea08b184a03044d2 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:28:02 -0800 Subject: [PATCH 06/22] ArangoDB: Feedback management (#11) * initial commit * updating feedback management readme to match arango * Removing comments above import * Working API test and updated readme * Working docker compose file * Docker compose creating network and docker image * code review * update readme & dev yaml * delete dev files * Delete arango_store.py --------- Co-authored-by: Anthony Mahanna --- comps/feedback_management/arango/Dockerfile | 30 +++ comps/feedback_management/arango/README.md | 172 ++++++++++++++++ .../feedback_management/arango/arango_conn.py | 32 +++ .../arango/arango_store.py | 186 ++++++++++++++++++ comps/feedback_management/arango/config.py | 13 ++ .../docker-compose-user-feedback-arango.yaml | 38 ++++ comps/feedback_management/arango/feedback.py | 172 ++++++++++++++++ .../arango/requirements.txt | 1 + 8 files changed, 644 insertions(+) create mode 100644 comps/feedback_management/arango/Dockerfile create mode 100644 comps/feedback_management/arango/README.md create mode 100644 comps/feedback_management/arango/arango_conn.py create mode 100644 comps/feedback_management/arango/arango_store.py create mode 100644 comps/feedback_management/arango/config.py create mode 100644 comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml create mode 100644 comps/feedback_management/arango/feedback.py create mode 100644 comps/feedback_management/arango/requirements.txt diff --git a/comps/feedback_management/arango/Dockerfile b/comps/feedback_management/arango/Dockerfile new file mode 100644 index 000000000..95ac359e6 --- /dev/null +++ b/comps/feedback_management/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/feedback_management/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/feedback_management/arango + +ENTRYPOINT ["python", "feedback.py"] diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md new file mode 100644 index 000000000..8eb223ce9 --- /dev/null +++ 
b/comps/feedback_management/arango/README.md @@ -0,0 +1,172 @@ +# πŸ—¨ Feedback Management Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Feedback Management microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +export PROTOCOL=${PROTOCOL} +export PYTHONPATH={Path to base of directory} +``` + +--- + +## πŸš€Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . +``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Feedback Management microservice + + ```bash + docker run -d -p 6016:6016 \ + --name="feedbackmanagement-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e PROTOCOL=${PROTOCOL} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:latest + + ``` + +--- + +### βœ… Invoke Microservice + +The Feedback Management microservice exposes the following API endpoints: + +- Save feedback data + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + }}' + + + # Take note that chat_id here would be the id get from feedback_arango service + # If you do not wish to maintain chat history via feedback_arango service, you may generate some random uuid for it or just leave it empty. + ``` + +- Update feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Fair and Moderate answer", + "rating": 2, + "is_thumbs_up": true + }, + "feedback_id": "{feedback_id of the data that wanted to update}"}' + + # Just include any feedback_data field value that you wanted to update. 
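+  # Based on feedback.py later in this patch, the create route returns the new feedback_id
+  # (the ArangoDB document `_key`) as a string when no feedback_id is supplied, and `true`
+  # when an existing entry is updated, as in this example.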
+ ``` + +- Retrieve feedback data by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id returned from save feedback route above}"}' + ``` + +- Delete feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id to be deleted}"}' + ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py new file mode 100644 index 000000000..f9ac9e411 --- /dev/null +++ b/comps/feedback_management/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL + + +class ArangoClient: + conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py new file mode 100644 index 000000000..cd22b8078 --- /dev/null +++ b/comps/feedback_management/arango/arango_store.py @@ -0,0 +1,186 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class FeedbackStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_feedback(self, feedback_data: BaseModel) -> str: + """Stores a new feedback data into the storage. + + Args: + feedback_data (object): The document to be stored. + + Returns: + str: The ID of the inserted feedback data. + + Raises: + Exception: If an error occurs while storing the feedback_data. + """ + try: + model_dump = feedback_data.model_dump(by_alias=True, mode="json", exclude={"feedback_id"}) + + inserted_feedback_data = self.collection.insert(model_dump) + + feedback_id = str(inserted_feedback_data["_key"]) + + return feedback_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_feedback(self, feedback_data: BaseModel) -> bool: + """Update a feedback data in the collection with given id. 
+ + Args: + feedback_id (str): The ID of the data to be updated. + updated_data (object): The data to be updated in the entry. + + Returns: + bool: True if the data updated successfully, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the feedback data. + """ + _key = feedback_data.feedback_id + document = self.collection.get(_key) + + if document is None: + raise KeyError(f"Document with ID: {_key} not found.") + + if document["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {_key} does not belong to user: {self.user}") + + try: + model_dump = feedback_data.feedback_data.model_dump(by_alias=True, mode="json") + + self.collection.update( + {"_key": _key, "feedback_data": model_dump}, + merge=True, + keep_none=False, + ) + + print(f"Updated document: {_key} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_feedback_of_user(self) -> list[dict]: + """Retrieves all feedback data of a user from the collection. + + Returns: + list[dict] | None: List of dict of feedback data of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + feedback_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `feedback_data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "feedback_data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["feedback_id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + feedback_data_list.append(document) + + return feedback_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_feedback_by_id(self, feedback_id: str) -> dict | None: + """Retrieves a user feedback data from the collection based on the given feedback ID. + + Args: + feedback_id (str): The ID of the feedback data to retrieve. + + Returns: + dict | None: The user's feedback data if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_feedback(self, feedback_id: str) -> bool: + """Delete a feedback data from collection by given feedback_id. + + Args: + feedback_id(str): The ID of the feedback data to be deleted. + + Returns: + bool: True if feedback is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(feedback_id) + print(f"Deleted document: {feedback_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py new file mode 100644 index 000000000..e3272febf --- /dev/null +++ b/comps/feedback_management/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +PROTOCOL = os.getenv("PROTOCOL", "http") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml new file mode 100644 index 000000000..f4be0c845 --- /dev/null +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arangodb: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + feedbackmanagement-arango: + image: opea/feedbackmanagement-arango:latest + container_name: feedbackmanagement-arango-server + ports: + - "6016:6016" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + PROTOCOL: ${PROTOCOL} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + feedback_network: + driver: bridge diff --git a/comps/feedback_management/arango/feedback.py b/comps/feedback_management/arango/feedback.py new file mode 100644 index 000000000..f1efa6f43 --- /dev/null +++ b/comps/feedback_management/arango/feedback.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Annotated, Optional + +from arango_store import FeedbackStore +from fastapi import HTTPException +from pydantic import BaseModel, Field + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("feedback_arango") +logflag = os.getenv("LOGFLAG", False) + + +class FeedbackData(BaseModel): + """This class represents the data model of FeedbackData collected to store in database.". + + Attributes: + is_thumbs_up (bool): True if the response is satisfy, False otherwise. + rating: (int)[Optional]: Score rating. Range from 0 (bad rating) to 5(good rating). + comment (str)[Optional]: Comment given for response. 
+ """ + + is_thumbs_up: bool + rating: Annotated[Optional[int], Field(ge=0, le=5)] = None + comment: Optional[str] = None + + +class ChatFeedback(BaseModel): + """This class represents the model for chat to collect FeedbackData together with ChatCompletionRequest data to store in database. + + Attributes: + chat_data (ChatCompletionRequest): ChatCompletionRequest object containing chat data to be stored. + feedback_data (FeedbackData): FeedbackData object containing feedback data for chat to be stored. + chat_id (str)[Optional]: The chat_id associated to the chat to be store together with feedback data. + feedback_id (str)[Optional]: The feedback_id of feedback data to be retrieved from database. + """ + + chat_data: ChatCompletionRequest + feedback_data: FeedbackData + chat_id: Optional[str] = None + feedback_id: Optional[str] = None + + +class FeedbackId(BaseModel): + """This class represent the data model for retrieve feedback data stored in database. + + Attributes: + user (str): The user of the requested feedback data. + feedback_id (str): The feedback_id of feedback data to be retrieved from database. + """ + + user: str + feedback_id: Optional[str] = None + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/create", + host="0.0.0.0", + input_datatype=FeedbackData, + port=6016, +) +async def create_feedback_data(feedback: ChatFeedback): + """Creates and stores a feedback data in database. + + Args: + feedback (ChatFeedback): The ChatFeedback class object containing feedback data to be stored. + + Returns: + response (str/bool): FeedbackId of the object created in database. True if data update successfully. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.chat_data.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + response = feedback_store.save_feedback(feedback) + else: + response = feedback_store.update_feedback(feedback) + + if logflag: + logger.info(response) + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/get", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def get_feedback(feedback: FeedbackId): + """Retrieves feedback_data from feedback store based on provided FeedbackId or user. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id. + + Returns: + JSON: Retrieved feedback data if successful, error otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id: + response = feedback_store.get_feedback_by_id(feedback.feedback_id) + else: + response = feedback_store.get_all_feedback_of_user() + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/delete", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def delete_feedback(feedback: FeedbackId): + """Delete a feedback data from feedback store by given feedback Id. 
+ + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id + + Returns: + Result of deletion if successful, None otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + raise Exception("feedback_id is required.") + else: + response = feedback_store.delete_feedback(feedback.feedback_id) + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@feedback_arango"].start() diff --git a/comps/feedback_management/arango/requirements.txt b/comps/feedback_management/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/feedback_management/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From b6ded9fb924a5d3fbb1645114ddf874d409d7c57 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 25 Nov 2024 19:05:21 -0500 Subject: [PATCH 07/22] remove: `PROTOCOL` env --- comps/feedback_management/arango/README.md | 2 -- comps/feedback_management/arango/arango_conn.py | 4 ++-- comps/feedback_management/arango/config.py | 3 +-- .../arango/docker-compose-user-feedback-arango.yaml | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 8eb223ce9..e0f070b68 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -13,7 +13,6 @@ export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} export COLLECTION_NAME=${COLLECTION_NAME} -export PROTOCOL=${PROTOCOL} export PYTHONPATH={Path to base of directory} ``` @@ -49,7 +48,6 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ - -e PROTOCOL=${PROTOCOL} \ -e COLLECTION_NAME=${COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index f9ac9e411..84ded0428 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index e3272febf..c332de7e5 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -9,5 +9,4 @@ ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") -PROTOCOL = os.getenv("PROTOCOL", "http") +COLLECTION_NAME = 
os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index f4be0c845..62ab0df54 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -28,11 +28,10 @@ services: ARANGO_PORT: ${ARANGO_PORT} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - PROTOCOL: ${PROTOCOL} DB_NAME: ${DB_NAME} COLLECTION_NAME: ${COLLECTION_NAME} restart: unless-stopped networks: - feedback_network: + default: driver: bridge From 17467499a6fd22afcdb4d058984af6e8735cdae2 Mon Sep 17 00:00:00 2001 From: SLasyaN Date: Tue, 26 Nov 2024 13:47:41 -0800 Subject: [PATCH 08/22] ArangoDB: PromptRegistry (#8) * Initial commit * remove unnecessary files * code review * update: `prompt_search` * new: `ARANGO_PROTOCOL` * README * cleanup --------- Co-authored-by: lasyasn Co-authored-by: Anthony Mahanna --- comps/feedback_management/README.md | 4 + comps/feedback_management/arango/README.md | 4 + .../feedback_management/arango/arango_conn.py | 4 +- comps/feedback_management/arango/config.py | 3 +- .../docker-compose-user-feedback-arango.yaml | 3 +- comps/prompt_registry/README.md | 4 + comps/prompt_registry/arango/DockerFile | 30 +++ comps/prompt_registry/arango/README.md | 120 ++++++++++ comps/prompt_registry/arango/arango_conn.py | 32 +++ comps/prompt_registry/arango/arango_store.py | 213 ++++++++++++++++++ comps/prompt_registry/arango/config.py | 13 ++ ...docker-compose-prompt-registry-arango.yaml | 38 ++++ comps/prompt_registry/arango/prompt.py | 148 ++++++++++++ comps/prompt_registry/arango/requirements.txt | 1 + 14 files changed, 613 insertions(+), 4 deletions(-) create mode 100644 comps/prompt_registry/arango/DockerFile create mode 100644 comps/prompt_registry/arango/README.md create mode 100644 comps/prompt_registry/arango/arango_conn.py create mode 100644 comps/prompt_registry/arango/arango_store.py create mode 100644 comps/prompt_registry/arango/config.py create mode 100644 comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml create mode 100644 comps/prompt_registry/arango/prompt.py create mode 100644 comps/prompt_registry/arango/requirements.txt diff --git a/comps/feedback_management/README.md b/comps/feedback_management/README.md index 2e68aa413..9cd4b42a5 100644 --- a/comps/feedback_management/README.md +++ b/comps/feedback_management/README.md @@ -20,3 +20,7 @@ The Feedback Management microservice able to support various database backends f ### Feedback Management with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Feedback Management with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index e0f070b68..7e9a5f840 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -6,9 +6,12 @@ This README provides setup guides and all the necessary information about the Fe ## Setup Environment Variables +See `config.py` for default values. 
+ ```bash export ARANGO_HOST=${ARANGO_HOST} export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} @@ -45,6 +48,7 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e no_proxy=$no_proxy \ -e ARANGO_HOST=${ARANGO_HOST} \ -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index 84ded0428..d6c4b5977 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index c332de7e5..bb790eb38 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -6,7 +6,8 @@ # ARANGO configuration ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 62ab0df54..8f9b3a85a 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -3,7 +3,7 @@ version: "3" services: - arangodb: + arango: image: arangodb/arangodb:latest container_name: arangodb ports: @@ -26,6 +26,7 @@ services: no_proxy: ${no_proxy} ARANGO_HOST: ${ARANGO_HOST} ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} DB_NAME: ${DB_NAME} diff --git a/comps/prompt_registry/README.md b/comps/prompt_registry/README.md index 6332a1a13..a99b1b27b 100644 --- a/comps/prompt_registry/README.md +++ b/comps/prompt_registry/README.md @@ -19,3 +19,7 @@ The Prompt Registry microservice able to support various database backends for s ### Prompt Registry with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Prompt Registry with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/prompt_registry/arango/DockerFile b/comps/prompt_registry/arango/DockerFile new file mode 100644 index 000000000..065920205 --- /dev/null +++ b/comps/prompt_registry/arango/DockerFile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: 
Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/prompt_registry/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/prompt_registry/arango + +ENTRYPOINT ["python", "prompt.py"] diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md new file mode 100644 index 000000000..e4bdd6c10 --- /dev/null +++ b/comps/prompt_registry/arango/README.md @@ -0,0 +1,120 @@ +# 🧾 Prompt Registry Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Prompt Registry microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## πŸš€Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . +``` + +### Run Docker with CLI + + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Prompt Registry microservice + + ```bash + docker run -d -p 6018:6018 \ + --name="promptregistry-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:latest + + ``` + +--- + +### βœ… Invoke Microservice + +The Prompt Registry microservice exposes the following API endpoints: + +- Save prompt + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" + }' + ``` + +- Retrieve prompt from database by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve prompt from database by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{_id returned from save prompt route above}"}' + ``` + +- Retrieve relevant prompt by keyword + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_text": "{keyword to search}"}' + ``` + 
+- Delete prompt by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{prompt_id to be deleted}"}' + ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/prompt_registry/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py new file mode 100644 index 000000000..fb80ccd20 --- /dev/null +++ b/comps/prompt_registry/arango/arango_store.py @@ -0,0 +1,213 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from arango.exceptions import IndexGetError +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from prompt import PromptCreate +from pydantic import BaseModel + +from comps import CustomLogger + +logger = CustomLogger("arango_store") +logflag = os.getenv("LOGFLAG", False) + + +class PromptStore: + + def __init__( + self, + user: str, + ): + self.user = user + self.inverted_index_exists = False + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_prompt(self, prompt: PromptCreate): + """Stores a new prompt into the storage. + + Args: + prompt: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted prompt. + + Raises: + Exception: If an error occurs while storing the prompt. + """ + try: + model_dump = prompt.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_prompt_data = self.collection.insert(model_dump) + + prompt_id = str(inserted_prompt_data["_key"]) + + return prompt_id + + except Exception as e: + print(e) + raise Exception(e) + + def get_all_prompt_of_user(self) -> list[dict]: + """Retrieves all prompts of a user from the collection. + + Returns: + list[dict] | None: List of dict of prompts of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + prompt_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Prompt Registry as a reference. 
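+ # Prompt documents are stored with a top-level `user` field (see PromptCreate in prompt.py);
+ # the AQL below is meant to return only the prompts that belong to the current user.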
+ cursor = """ + FOR doc IN @@collection + FILTER doc.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + prompt_data_list.append(document) + + return prompt_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_prompt_by_id(self, prompt_id: str) -> dict | None: + """Retrieves a user prompt from the collection based on the given prompt ID. + + Args: + prompt_id (str): The ID of the prompt to retrieve. + + Returns: + dict | None: The user prompt if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Prompt with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. Prompt with ID: {prompt_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def prompt_search(self, keyword: str) -> list | None: + """Retrieves prompt from the collection based on keyword provided. + + Args: + keyword (str): The keyword of prompt to search for. + + Returns: + list | None: The list of relevant prompt if found, None otherwise. + + Raises: + Exception: If there is an error while searching data. + """ + try: + index_name = "prompt_text_index" + + if not self.inverted_index_exists: + try: + self.collection.get_index(index_name) + + except IndexGetError: + self.collection.add_inverted_index( + fields=["prompt_text"], + name=index_name, + # TODO: add more kwargs if needed + ) + + self.inverted_index_exists = True + + query = """ + FOR doc IN @@collection + OPTIONS { indexHint: @index_name, forceIndexHint: true } + FILTER PHRASE(doc.prompt_text, @keyword, "text_en") + RETURN doc + """ + + cursor = self.db_client.aql.execute( + query, + bind_vars={ + "@collection": self.collection.name, + "index_name": index_name, + "keyword": keyword, + }, + ) + + serialized_data = [] + for doc in cursor: + doc["id"] = str(doc["_key"]) + del doc["_id"] + del doc["_key"] + del doc["_rev"] + + serialized_data.append(doc) + + return serialized_data + + except Exception as e: + print(e) + raise Exception(e) + + def delete_prompt(self, prompt_id: str) -> bool: + """Delete a prompt from collection by given prompt_id. + + Args: + prompt_id(str): The ID of the prompt to be deleted. + + Returns: + bool: True if prompt is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided prompt_id is invalid. + Exception: If the user does not match with the document user. + Exception: If any error occurs during the delete process. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Feedback with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {prompt_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(prompt_id) + print(f"Deleted document: {prompt_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py new file mode 100644 index 000000000..9719f1358 --- /dev/null +++ b/comps/prompt_registry/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGODB_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml new file mode 100644 index 000000000..b1aee077d --- /dev/null +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + promptregistry-arango: + image: opea/promptregistry-arango:latest + container_name: promptregistry-arango-server + ports: + - "6018:6018" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/prompt_registry/arango/prompt.py b/comps/prompt_registry/arango/prompt.py new file mode 100644 index 000000000..c46e0174c --- /dev/null +++ b/comps/prompt_registry/arango/prompt.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Optional + +from arango_store import PromptStore +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice + +logger = CustomLogger("prompt_arango") +logflag = os.getenv("LOGFLAG", False) + + +class PromptCreate(BaseModel): + """This class represents the data model for creating and storing a new prompt in the database. + + Attributes: + prompt_text (str): The text content of the prompt. + user (str): The user or creator of the prompt. + """ + + prompt_text: str + user: str + + +class PromptId(BaseModel): + """This class represent the data model for retrieve prompt stored in database. + + Attributes: + user (str): The user of the requested prompt. + prompt_id (str): The prompt_id of prompt to be retrieved from database. 
+ """ + + user: str + prompt_id: Optional[str] = None + prompt_text: Optional[str] = None + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/create", + host="0.0.0.0", + input_datatype=PromptCreate, + port=6018, +) +async def create_prompt(prompt: PromptCreate): + """Creates and stores a prompt in prompt store. + + Args: + prompt (PromptCreate): The PromptCreate class object containing the data to be stored. + + Returns: + JSON (PromptResponse): PromptResponse class object, None otherwise. + """ + if logflag: + logger.info(prompt) + + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + response = prompt_store.save_prompt(prompt) + if logflag: + logger.info(response) + + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/get", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def get_prompt(prompt: PromptId): + """Retrieves prompt from prompt store based on provided PromptId or user. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + JSON: Retrieved prompt data if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + + if prompt.prompt_id is not None: + response = prompt_store.get_user_prompt_by_id(prompt.prompt_id) + elif prompt.prompt_text: + response = prompt_store.prompt_search(prompt.prompt_text) + else: + response = prompt_store.get_all_prompt_of_user() + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/delete", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def delete_prompt(prompt: PromptId): + """Delete a prompt from prompt store by given PromptId. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + Result of deletion if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + if prompt.prompt_id is None: + raise Exception("Prompt id is required.") + else: + response = prompt_store.delete_prompt(prompt.prompt_id) + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +if __name__ == "__main__": + opea_microservices["opea_service@prompt_arango"].start() diff --git a/comps/prompt_registry/arango/requirements.txt b/comps/prompt_registry/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/prompt_registry/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From bf413276134fd8ca942049239caa071b75660876 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:59:41 -0800 Subject: [PATCH 09/22] ArangoDB: Chathistory (#10) * Initial chat history implementation without API and docker implementation * make copy and remove async * API functionality matching MongoDB implementation Working API functionality, update to dockerfile required, and additional checks when updating document required. 
* Delete temp.py + +* Push changes and reset repo + +* Async definitions working in curl calls, updated read me to ArangoDB setup + +* Working docker container with network + +* Removing need for network to be created before docker compose + +* Cleanup async files and backup files + +* code review + +* fix: typo + +* revert mongo changes + +--------- + +Co-authored-by: Anthony Mahanna --- .gitignore | 3 +- comps/chathistory/arango/Dockerfile | 30 +++ comps/chathistory/arango/README.md | 123 ++++++++++++ comps/chathistory/arango/arango_conn.py | 32 +++ comps/chathistory/arango/arango_store.py | 186 ++++++++++++++++++ comps/chathistory/arango/chat.py | 146 ++++++++++++++ comps/chathistory/arango/config.py | 13 ++ .../docker-compose-chathistory-arango.yaml | 38 ++++ comps/chathistory/arango/requirements.txt | 1 + comps/prompt_registry/arango/config.py | 2 +- 10 files changed, 572 insertions(+), 2 deletions(-) create mode 100644 comps/chathistory/arango/Dockerfile create mode 100644 comps/chathistory/arango/README.md create mode 100644 comps/chathistory/arango/arango_conn.py create mode 100644 comps/chathistory/arango/arango_store.py create mode 100644 comps/chathistory/arango/chat.py create mode 100644 comps/chathistory/arango/config.py create mode 100644 comps/chathistory/arango/docker-compose-chathistory-arango.yaml create mode 100644 comps/chathistory/arango/requirements.txt diff --git a/.gitignore b/.gitignore index 9778bf8f7..3a428754d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__ *.egg-info/ .DS_Store -.venv \ No newline at end of file +.venv +venv/ diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile new file mode 100644 index 000000000..f402e5526 --- /dev/null +++ b/comps/chathistory/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/chathistory/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=/home/user + +WORKDIR /home/user/comps/chathistory/arango + +ENTRYPOINT ["python", "chat.py"] diff --git a/comps/chathistory/arango/README.md b/comps/chathistory/arango/README.md new file mode 100644 index 000000000..428a65255 --- /dev/null +++ b/comps/chathistory/arango/README.md @@ -0,0 +1,123 @@ +# πŸ“ Chat History Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Chat History microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. 
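+Once the variables below are exported, a quick connectivity check with `python-arango` (the service's only extra dependency, per `requirements.txt`) can be run before starting the microservice — a minimal sketch, assuming the defaults from `config.py`:
+
+```python
+import os
+
+from arango import ArangoClient
+
+url = f"{os.getenv('ARANGO_PROTOCOL', 'http')}://{os.getenv('ARANGO_HOST', 'localhost')}:{os.getenv('ARANGO_PORT', 8529)}/"
+
+# Same connection pattern as arango_conn.py: connect to _system and verify credentials.
+client = ArangoClient(hosts=url)
+sys_db = client.db("_system", username=os.getenv("ARANGO_USERNAME", "root"), password=os.getenv("ARANGO_PASSWORD", "test"), verify=True)
+print(sys_db.version())
+```
+
+The variables themselves are set as follows: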
+ +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## πŸš€Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/chathistory-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . +``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run the Chat History microservice + + ```bash + docker run -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:latest + ``` + +--- + +## βœ… Invoke Microservice + +The Chat History microservice exposes the following API endpoints: + +- Create new chat conversation + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } + }' + ``` + +- Get all the Conversations for a user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Get a specific conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` + +- Update the conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages Update", "user": "test" + }, + "id":"48918" + }' + ``` + +- Delete a stored conversation. 
+ + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` diff --git a/comps/chathistory/arango/arango_conn.py b/comps/chathistory/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/chathistory/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/chathistory/arango/arango_store.py b/comps/chathistory/arango/arango_store.py new file mode 100644 index 000000000..8ab6928eb --- /dev/null +++ b/comps/chathistory/arango/arango_store.py @@ -0,0 +1,186 @@ +from typing import Any + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class DocumentStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_document(self, document: BaseModel) -> str: + """Stores a new document into the storage. + + Args: + document: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted document. + + Raises: + Exception: If an error occurs while storing the document. + """ + try: + model_dump = document.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_document = self.collection.insert(model_dump) + + document_id = str(inserted_document["_key"]) + + return document_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_document(self, document_id: str, updated_data: BaseModel, first_query: Any) -> str: + """Updates a document in the collection with the given document_id. + + Args: + document_id (str): The ID of the document to update. + updated_data (object): The updated data to be set in the document. + first_query (object): The first query to be set in the document. + + Returns: + bool: True if the document was successfully updated, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the document data. 
+ """ + document = self.collection.get(document_id) + + if document is None: + raise Exception(f"Unable to find Document {document_id}") + + if document["data"]["user"] != self.user: + raise Exception(f"User {self.user} is not allowed to update Document {document_id}.") + + try: + self.collection.update( + { + "_key": document_id, + "data": updated_data.model_dump(by_alias=True, mode="json"), + "first_query": first_query, + }, + merge=True, + keep_none=True, + ) + + print(f"Updated document: {document_id} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_documents_of_user(self) -> list[dict]: + """Retrieves all documents of a specific user from the collection. + + Returns: + A list of dictionaries representing the conversation documents. + Raises: + Exception: If there is an error while retrieving the documents. + """ + try: + document_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + document_list.append(document) + + return document_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_documents_by_id(self, document_id: str) -> dict | None: + """Retrieves a user document from the collection based on the given document ID. + + Args: + document_id (str): The ID of the document to retrieve. + + Returns: + dict | None: The user document if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {document_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_document(self, document_id: str) -> str: + """Deletes a document from the collection based on the provided document ID. + + Args: + document_id (str): The ID of the document to be deleted. + + Returns: + bool: True if the document is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided document_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {document_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(document_id) + print(f"Deleted document: {document_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/chathistory/arango/chat.py b/comps/chathistory/arango/chat.py new file mode 100644 index 000000000..ce9c0a16e --- /dev/null +++ b/comps/chathistory/arango/chat.py @@ -0,0 +1,146 @@ +ο»Ώimport os +from typing import Optional + +from arango_store import DocumentStore +from fastapi import HTTPException +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("chathistory_arango") +logflag = os.getenv("LOGFLAG", False) + + +class ChatMessage(BaseModel): + data: ChatCompletionRequest + first_query: Optional[str] = None + id: Optional[str] = None + + +class ChatId(BaseModel): + user: str + id: Optional[str] = None + + +def get_first_string(value): + if isinstance(value, str): + return value + elif isinstance(value, list): + # Assuming we want the first string from the first dictionary + if value and isinstance(value[0], dict): + first_dict = value[0] + if first_dict: + # Get the first value from the dictionary + first_key = next(iter(first_dict)) + return first_dict[first_key] + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/create", + host="0.0.0.0", + input_datatype=ChatMessage, + port=6012, +) +async def create_documents(document: ChatMessage): + """Creates or updates a document in the document store. + + Args: + document (ChatMessage): The ChatMessage object containing the data to be stored. + + Returns: + The result of the operation if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + if document.data.user is None: + raise HTTPException(status_code=500, detail="Please provide the user information") + store = DocumentStore(document.data.user) + store.initialize_storage() + if document.first_query is None: + document.first_query = get_first_string(document.data.messages) + if document.id: + res = store.update_document(document.id, document.data, document.first_query) + else: + res = store.save_document(document) + if logflag: + logger.info(res) + return res + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/get", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def get_documents(document: ChatId): + """Retrieves documents from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and optional document id. + + Returns: + The retrieved documents if successful, None otherwise. 
+ """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + res = store.get_all_documents_of_user() + else: + res = store.get_user_documents_by_id(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/delete", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def delete_documents(document: ChatId): + """Deletes a document from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and document id. + + Returns: + The result of the deletion if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + raise Exception("Document id is required.") + else: + res = store.delete_document(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@chathistory_arango"].start() diff --git a/comps/chathistory/arango/config.py b/comps/chathistory/arango/config.py new file mode 100644 index 000000000..9e66e8f1d --- /dev/null +++ b/comps/chathistory/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "ChatHistory") diff --git a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml new file mode 100644 index 000000000..36819c99b --- /dev/null +++ b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + chathistory-arango: + image: opea/chathistory-arango:latest + container_name: chathistory-arango-server + ports: + - "6012:6012" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/chathistory/arango/requirements.txt b/comps/chathistory/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/chathistory/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No 
newline at end of file diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py index 9719f1358..e597df0fb 100644 --- a/comps/prompt_registry/arango/config.py +++ b/comps/prompt_registry/arango/config.py @@ -4,7 +4,7 @@ import os # ARANGO configuration -ARANGO_HOST = os.getenv("ARANGODB_HOST", "localhost") +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") From 9d46b278e46b31da63e360e4a4d41a600f6faa47 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:01:17 -0500 Subject: [PATCH 10/22] update ChatHistory README --- comps/chathistory/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/comps/chathistory/README.md b/comps/chathistory/README.md index 4f7bcbf71..754fd0bd8 100644 --- a/comps/chathistory/README.md +++ b/comps/chathistory/README.md @@ -24,3 +24,7 @@ The Chat History microservice able to support various database backends for stor ### Chat History with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Chat History with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) From 029c1fdfe44b85acdcb9d06a6a13020744504d69 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:09:20 -0500 Subject: [PATCH 11/22] new: tests --- tests/chathistory/test_chathistory_arango.sh | 91 ++++++++++++++ .../test_feedback_management_arango.sh | 113 ++++++++++++++++++ .../test_prompt_registry_arango.sh | 89 ++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 tests/chathistory/test_chathistory_arango.sh create mode 100644 tests/feedback_management/test_feedback_management_arango.sh create mode 100644 tests/prompt_registry/test_prompt_registry_arango.sh diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh new file mode 100644 index 000000000..50481262f --- /dev/null +++ b/tests/chathistory/test_chathistory_arango.sh @@ -0,0 +1,91 @@ +ο»Ώ#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Conversations"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/chathistory-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/chathistory-arango-server built fail" + exit 1 + else + echo "opea/chathistory-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-chathistory-arango-server" \ + -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://${ip_address}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-chathistory-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh new file mode 100644 index 000000000..925555030 --- /dev/null +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Feedback"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/feedbackmanagement-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/feedbackmanagement-arango-server built fail" + exit 1 + else + echo "opea/feedbackmanagement-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-feedbackmanagement-arango-server" \ + -p 6016:6016 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-feedbackmanagement-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh new file mode 100644 index 000000000..abc15ee7f --- /dev/null +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Prompts"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/promptregistry-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/promptregistry-arango-server built fail" + exit 1 + else + echo "opea/promptregistry-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-promptregistry-arango-server" \ + -p 6018:6018 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-promptregistry-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 3a8060710d3bce0dd4a55bd1582a342bbc332822 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:16:14 -0500 Subject: [PATCH 12/22] update: docker compose workflows --- .github/workflows/docker/compose/chathistory-compose.yaml | 4 ++++ .../docker/compose/feedback_management-compose.yaml | 8 ++++++-- .../workflows/docker/compose/prompt_registry-compose.yaml | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker/compose/chathistory-compose.yaml b/.github/workflows/docker/compose/chathistory-compose.yaml index 987447fee..64dc579fc 100644 --- a/.github/workflows/docker/compose/chathistory-compose.yaml +++ b/.github/workflows/docker/compose/chathistory-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/chathistory/mongo/Dockerfile image: ${REGISTRY:-opea}/chathistory-mongo-server:${TAG:-latest} + chathistory-arango-server: + build: + dockerfile: comps/chathistory/arango/Dockerfile + image: ${REGISTRY:-opea}/chathistory-arango-server:${TAG:-latest} diff --git a/.github/workflows/docker/compose/feedback_management-compose.yaml b/.github/workflows/docker/compose/feedback_management-compose.yaml index 0a3cfce66..51f5ae343 100644 --- a/.github/workflows/docker/compose/feedback_management-compose.yaml +++ b/.github/workflows/docker/compose/feedback_management-compose.yaml @@ -3,7 +3,11 @@ # this file should be run in the root of the repo services: - feedbackmanagement: + feedbackmanagement-mongo-server: build: dockerfile: comps/feedback_management/mongo/Dockerfile - image: ${REGISTRY:-opea}/feedbackmanagement:${TAG:-latest} + image: ${REGISTRY:-opea}/feedbackmanagement-mongo-server:${TAG:-latest} + feedbackmanagement-arango-server: + build: + dockerfile: comps/feedback_management/arango/Dockerfile + image: ${REGISTRY:-opea}/feedbackmanagement-arango-server:${TAG:-latest} \ No newline at end of file diff --git a/.github/workflows/docker/compose/prompt_registry-compose.yaml b/.github/workflows/docker/compose/prompt_registry-compose.yaml index 34d8973df..4415a18a9 100644 --- 
a/.github/workflows/docker/compose/prompt_registry-compose.yaml +++ b/.github/workflows/docker/compose/prompt_registry-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/prompt_registry/mongo/Dockerfile image: ${REGISTRY:-opea}/promptregistry-mongo-server:${TAG:-latest} + promptregistry-arango-server: + build: + dockerfile: comps/prompt_registry/arango/Dockerfile + image: ${REGISTRY:-opea}/promptregistry-arango-server:${TAG:-latest} From fabd85c091dd02d83a662cc1aee147b75c6a78f4 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 27 Nov 2024 13:14:37 -0500 Subject: [PATCH 13/22] fix: `arango` --- comps/chathistory/arango/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile index f402e5526..0977bdc45 100644 --- a/comps/chathistory/arango/Dockerfile +++ b/comps/chathistory/arango/Dockerfile @@ -25,6 +25,6 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \ ENV PYTHONPATH=/home/user -WORKDIR /home/user/comps/chathistory/mongo +WORKDIR /home/user/comps/chathistory/arango ENTRYPOINT ["python", "chat.py"] From 0c21ff5e88f344108f7d1fb9d7146f3e58f3c05a Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 27 Nov 2024 16:42:11 -0500 Subject: [PATCH 14/22] fix: python path --- comps/chathistory/arango/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile index 0977bdc45..fbd7d2e9c 100644 --- a/comps/chathistory/arango/Dockerfile +++ b/comps/chathistory/arango/Dockerfile @@ -23,7 +23,7 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/comps/chathistory/arango/requirements.txt && \ pip install --no-cache-dir -r /home/user/requirements.txt -ENV PYTHONPATH=/home/user +ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/chathistory/arango From c5b936ba38a360975b4cc590575642a75467da6d Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:15:56 -0500 Subject: [PATCH 15/22] rename arango envs (#14) * initial commit: rename arango envs * fix comment --- comps/chathistory/arango/README.md | 16 ++++++---------- comps/chathistory/arango/arango_conn.py | 10 +++++----- comps/chathistory/arango/arango_store.py | 8 ++++---- comps/chathistory/arango/config.py | 10 ++++------ .../docker-compose-chathistory-arango.yaml | 8 +++----- comps/feedback_management/arango/README.md | 16 ++++++---------- comps/feedback_management/arango/arango_conn.py | 10 +++++----- comps/feedback_management/arango/arango_store.py | 8 ++++---- comps/feedback_management/arango/config.py | 10 ++++------ .../docker-compose-user-feedback-arango.yaml | 8 +++----- comps/prompt_registry/arango/README.md | 16 ++++++---------- comps/prompt_registry/arango/arango_conn.py | 10 +++++----- comps/prompt_registry/arango/arango_store.py | 9 ++++----- comps/prompt_registry/arango/config.py | 10 ++++------ .../docker-compose-prompt-registry-arango.yaml | 8 +++----- tests/chathistory/test_chathistory_arango.sh | 16 ++++++---------- .../test_feedback_management_arango.sh | 16 ++++++---------- .../test_prompt_registry_arango.sh | 16 ++++++---------- 18 files changed, 84 insertions(+), 121 deletions(-) diff --git a/comps/chathistory/arango/README.md b/comps/chathistory/arango/README.md index 428a65255..b0379cb40 100644 --- a/comps/chathistory/arango/README.md +++ b/comps/chathistory/arango/README.md @@ -9,13 
+9,11 @@ This README provides setup guides and all the necessary information about the Ch See `config.py` for default values. ```bash -export ARANGO_HOST=${ARANGO_HOST} -export ARANGO_PORT=${ARANGO_PORT} -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_URL=${ARANGO_URL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} -export DB_NAME=${DB_NAME} -export COLLECTION_NAME=${COLLECTION_NAME} +export ARANGO_DB_NAME=${ARANGO_DB_NAME} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} ``` --- @@ -44,13 +42,11 @@ docker build -t opea/chathistory-arango-server:latest --build-arg https_proxy=$h -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/chathistory-arango-server:latest ``` diff --git a/comps/chathistory/arango/arango_conn.py b/comps/chathistory/arango/arango_conn.py index d6c4b5977..c5c271c1e 100644 --- a/comps/chathistory/arango/arango_conn.py +++ b/comps/chathistory/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME +from config import ARANGO_URL, ARANGO_PASSWORD, ARANGO_USERNAME, ARANGO_DB_NAME class ArangoClient: - conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = ARANGO_URL @staticmethod def get_db_client() -> StandardDatabase: @@ -19,11 +19,11 @@ def get_db_client() -> StandardDatabase: sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) # Create target database if it doesn't exist - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) # Now connect to the target database - db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) return db diff --git a/comps/chathistory/arango/arango_store.py b/comps/chathistory/arango/arango_store.py index 8ab6928eb..de25bd9ca 100644 --- a/comps/chathistory/arango/arango_store.py +++ b/comps/chathistory/arango/arango_store.py @@ -1,7 +1,7 @@ from typing import Any from arango_conn import ArangoClient -from config import COLLECTION_NAME +from config import ARANGO_COLLECTION_NAME from pydantic import BaseModel @@ -16,10 +16,10 @@ def __init__( def initialize_storage(self) -> None: self.db_client = ArangoClient.get_db_client() - if not self.db_client.has_collection(COLLECTION_NAME): - self.db_client.create_collection(COLLECTION_NAME) + if not self.db_client.has_collection(ARANGO_COLLECTION_NAME): + self.db_client.create_collection(ARANGO_COLLECTION_NAME) - self.collection = self.db_client.collection(COLLECTION_NAME) + self.collection = self.db_client.collection(ARANGO_COLLECTION_NAME) def save_document(self, document: BaseModel) -> str: """Stores a new document into the storage. 
diff --git a/comps/chathistory/arango/config.py b/comps/chathistory/arango/config.py index 9e66e8f1d..f7351fcb4 100644 --- a/comps/chathistory/arango/config.py +++ b/comps/chathistory/arango/config.py @@ -3,11 +3,9 @@ import os -# ARANGO configuration -ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") -ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) -ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "ChatHistory") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "OPEA") +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "ChatHistory") diff --git a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml index 36819c99b..218ec1b63 100644 --- a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml +++ b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml @@ -24,13 +24,11 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: ${no_proxy} - ARANGO_HOST: ${ARANGO_HOST} - ARANGO_PORT: ${ARANGO_PORT} - ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} - COLLECTION_NAME: ${COLLECTION_NAME} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + ARANGO_COLLECTION_NAME: ${ARANGO_COLLECTION_NAME} restart: unless-stopped networks: diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 7e9a5f840..758a74430 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -9,13 +9,11 @@ This README provides setup guides and all the necessary information about the Fe See `config.py` for default values. 
```bash -export ARANGO_HOST=${ARANGO_HOST} -export ARANGO_PORT=${ARANGO_PORT} -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_URL=${ARANGO_URL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} -export DB_NAME=${DB_NAME} -export COLLECTION_NAME=${COLLECTION_NAME} +export ARANGO_DB_NAME=${ARANGO_DB_NAME} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} export PYTHONPATH={Path to base of directory} ``` @@ -46,13 +44,11 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index d6c4b5977..c5c271c1e 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME +from config import ARANGO_URL, ARANGO_PASSWORD, ARANGO_USERNAME, ARANGO_DB_NAME class ArangoClient: - conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = ARANGO_URL @staticmethod def get_db_client() -> StandardDatabase: @@ -19,11 +19,11 @@ def get_db_client() -> StandardDatabase: sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) # Create target database if it doesn't exist - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) # Now connect to the target database - db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) return db diff --git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py index cd22b8078..a20a6147d 100644 --- a/comps/feedback_management/arango/arango_store.py +++ b/comps/feedback_management/arango/arango_store.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from arango_conn import ArangoClient -from config import COLLECTION_NAME +from config import ARANGO_COLLECTION_NAME from pydantic import BaseModel @@ -17,10 +17,10 @@ def __init__( def initialize_storage(self) -> None: self.db_client = ArangoClient.get_db_client() - if not self.db_client.has_collection(COLLECTION_NAME): - self.db_client.create_collection(COLLECTION_NAME) + if not self.db_client.has_collection(ARANGO_COLLECTION_NAME): + self.db_client.create_collection(ARANGO_COLLECTION_NAME) - self.collection = self.db_client.collection(COLLECTION_NAME) + self.collection = self.db_client.collection(ARANGO_COLLECTION_NAME) def save_feedback(self, feedback_data: BaseModel) -> str: """Stores a new feedback data into the storage. 
diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index bb790eb38..36826b1e9 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -3,11 +3,9 @@ import os -# ARANGO configuration -ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") -ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) -ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "OPEA") +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 8f9b3a85a..f01c5d03f 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -24,13 +24,11 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: ${no_proxy} - ARANGO_HOST: ${ARANGO_HOST} - ARANGO_PORT: ${ARANGO_PORT} - ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} - COLLECTION_NAME: ${COLLECTION_NAME} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + ARANGO_COLLECTION_NAME: ${ARANGO_COLLECTION_NAME} restart: unless-stopped networks: diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md index e4bdd6c10..d746e9ea7 100644 --- a/comps/prompt_registry/arango/README.md +++ b/comps/prompt_registry/arango/README.md @@ -9,13 +9,11 @@ This README provides setup guides and all the necessary information about the Pr See `config.py` for default values. 
```bash -export ARANGO_HOST=${ARANGO_HOST} -export ARANGO_PORT=${ARANGO_PORT} -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_URL=${ARANGO_URL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} -export DB_NAME=${DB_NAME} -export COLLECTION_NAME=${COLLECTION_NAME} +export ARANGO_DB_NAME=${ARANGO_DB_NAME} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} ``` --- @@ -46,13 +44,11 @@ docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/promptregistry-arango-server:latest ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py index d6c4b5977..c5c271c1e 100644 --- a/comps/prompt_registry/arango/arango_conn.py +++ b/comps/prompt_registry/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME +from config import ARANGO_URL, ARANGO_PASSWORD, ARANGO_USERNAME, ARANGO_DB_NAME class ArangoClient: - conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = ARANGO_URL @staticmethod def get_db_client() -> StandardDatabase: @@ -19,11 +19,11 @@ def get_db_client() -> StandardDatabase: sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) # Create target database if it doesn't exist - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) # Now connect to the target database - db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) return db diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py index fb80ccd20..17f73532a 100644 --- a/comps/prompt_registry/arango/arango_store.py +++ b/comps/prompt_registry/arango/arango_store.py @@ -5,9 +5,8 @@ from arango.exceptions import IndexGetError from arango_conn import ArangoClient -from config import COLLECTION_NAME +from config import ARANGO_COLLECTION_NAME from prompt import PromptCreate -from pydantic import BaseModel from comps import CustomLogger @@ -27,10 +26,10 @@ def __init__( def initialize_storage(self) -> None: self.db_client = ArangoClient.get_db_client() - if not self.db_client.has_collection(COLLECTION_NAME): - self.db_client.create_collection(COLLECTION_NAME) + if not self.db_client.has_collection(ARANGO_COLLECTION_NAME): + self.db_client.create_collection(ARANGO_COLLECTION_NAME) - self.collection = self.db_client.collection(COLLECTION_NAME) + self.collection = self.db_client.collection(ARANGO_COLLECTION_NAME) def save_prompt(self, prompt: PromptCreate): """Stores a new prompt into the storage. 
diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py index e597df0fb..cf048dee9 100644 --- a/comps/prompt_registry/arango/config.py +++ b/comps/prompt_registry/arango/config.py @@ -3,11 +3,9 @@ import os -# ARANGO configuration -ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") -ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) -ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "OPEA") +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml index b1aee077d..335be9411 100644 --- a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -24,13 +24,11 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: ${no_proxy} - ARANGO_HOST: ${ARANGO_HOST} - ARANGO_PORT: ${ARANGO_PORT} - ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} - COLLECTION_NAME: ${COLLECTION_NAME} + ARANGO_DB_NAME: ${DB_NAME} + ARANGO_COLLECTION_NAME: ${COLLECTION_NAME} restart: unless-stopped networks: diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh index 50481262f..f9d731802 100644 --- a/tests/chathistory/test_chathistory_arango.sh +++ b/tests/chathistory/test_chathistory_arango.sh @@ -7,13 +7,11 @@ set -x WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') -export ARANGO_HOST=${ip_address} -export ARANGO_PORT=8529 -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} -export DB_NAME=${DB_NAME:-"Conversations"} -export COLLECTION_NAME=${COLLECTION_NAME:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"Conversations"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH @@ -36,13 +34,11 @@ function start_service() { -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/chathistory-arango-server:comps sleep 10s diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh index 925555030..6bbd32598 100644 --- a/tests/feedback_management/test_feedback_management_arango.sh +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -7,13 +7,11 @@ set -xe WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') -export ARANGO_HOST=${ip_address} -export 
ARANGO_PORT=8529 -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} -export DB_NAME=${DB_NAME:-"Feedback"} -export COLLECTION_NAME=${COLLECTION_NAME:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"Feedback"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH @@ -36,13 +34,11 @@ function start_service() { -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:comps sleep 10s diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh index abc15ee7f..16d81b17d 100644 --- a/tests/prompt_registry/test_prompt_registry_arango.sh +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -7,13 +7,11 @@ set -x WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') -export ARANGO_HOST=${ip_address} -export ARANGO_PORT=8529 -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} -export DB_NAME=${DB_NAME:-"Prompts"} -export COLLECTION_NAME=${COLLECTION_NAME:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"Prompts"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH @@ -36,13 +34,11 @@ function start_service() { -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/promptregistry-arango-server:comps sleep 10s From 23ac66a3f97aad494d7b80164ae9aff133629a99 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:16:58 -0500 Subject: [PATCH 16/22] ArangoDB: Dataprep (#12) * initial commit * fix: env * Update README.md * Revert "Update README.md" This reverts commit 8f750e4472e33d9c11bdc39606ea6b6e33fef892. * fix: create database * cleanup * new: chunk embedding generation * new: `cithash` dep * cleanup: `ingest_data_to_arango` * new: envs in `config` * fix: more envs * more env cleanup * fix: deprecated line * fix: graph doc * update dataprep-compose * Dockerfile update and parametrized prepare_doc_arango.py (#15) * Initial readme and prepare doc arango, with embeddings by Anthony * Adding git to Dockerfile, tested dockerfile and dockercompose. 
Also parametrized variables in prepare_doc_arango.py * Updating readme with adjustable parameters listed * Only printing debug statements if log flag is on * add review * review pt 2 --------- Co-authored-by: Anthony Mahanna * update dataprep readme --------- Co-authored-by: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> --- .../docker/compose/dataprep-compose.yaml | 4 + comps/dataprep/README.md | 4 + comps/dataprep/arango/__init__.py | 2 + comps/dataprep/arango/langchain/Dockerfile | 39 ++ comps/dataprep/arango/langchain/README.md | 149 ++++++++ comps/dataprep/arango/langchain/__init__.py | 2 + comps/dataprep/arango/langchain/config.py | 42 +++ .../docker-compose-dataprep-arango.yaml | 53 +++ .../arango/langchain/prepare_doc_arango.py | 342 ++++++++++++++++++ .../arango/langchain/requirements.txt | 32 ++ 10 files changed, 669 insertions(+) create mode 100644 comps/dataprep/arango/__init__.py create mode 100644 comps/dataprep/arango/langchain/Dockerfile create mode 100644 comps/dataprep/arango/langchain/README.md create mode 100644 comps/dataprep/arango/langchain/__init__.py create mode 100644 comps/dataprep/arango/langchain/config.py create mode 100644 comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml create mode 100644 comps/dataprep/arango/langchain/prepare_doc_arango.py create mode 100644 comps/dataprep/arango/langchain/requirements.txt diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index 7908e8c26..6053fd3d0 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -63,3 +63,7 @@ services: build: dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest} + dataprep-arango: + build: + dockerfile: comps/dataprep/arango/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-arango:${TAG:-latest} \ No newline at end of file diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index 46a57d37d..02b78ef99 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -44,3 +44,7 @@ For details, please refer to this [readme](vdms/README.md) ## Dataprep Microservice with Multimodal For details, please refer to this [readme](multimodal/redis/langchain/README.md) + +## Dataprep Microservice with ArangoDB + +For details, please refer to this [readme](arango/langchain/README.md) diff --git a/comps/dataprep/arango/__init__.py b/comps/dataprep/arango/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/arango/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/Dockerfile b/comps/dataprep/arango/langchain/Dockerfile new file mode 100644 index 000000000..5d8aa7a48 --- /dev/null +++ b/comps/dataprep/arango/langchain/Dockerfile @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip 
install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
+    pip install --no-cache-dir -r /home/user/comps/dataprep/arango/langchain/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+USER root
+
+RUN mkdir -p /home/user/comps/dataprep/arango/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/arango/langchain/uploaded_files
+
+USER user
+
+WORKDIR /home/user/comps/dataprep/arango/langchain
+
+ENTRYPOINT ["python", "prepare_doc_arango.py"]
\ No newline at end of file
diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md
new file mode 100644
index 000000000..37f0a078b
--- /dev/null
+++ b/comps/dataprep/arango/langchain/README.md
@@ -0,0 +1,149 @@
+# Dataprep Microservice with ArangoDB
+
+## πŸš€Start Microservice with Python
+
+### Install Requirements
+
+```bash
+pip install -r requirements.txt
+apt-get install libtesseract-dev -y
+apt-get install poppler-utils -y
+```
+
+### Start ArangoDB Server
+
+To launch ArangoDB locally, first ensure you have Docker installed. Then, you can launch the database with the following Docker command.
+
+```bash
+docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest
+```
+
+### Setup Environment Variables
+
+```bash
+export no_proxy=${your_no_proxy}
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_http_proxy}
+export ARANGO_URL=${your_arango_url}
+export ARANGO_USERNAME=${your_arango_username}
+export ARANGO_PASSWORD=${your_arango_password}
+export ARANGO_DB_NAME=${your_db_name}
+export PYTHONPATH=${path_to_comps}
+```
+
+### Start Document Preparation Microservice for ArangoDB with Python Script
+
+Start the document preparation microservice for ArangoDB with the command below.
+
+```bash
+python prepare_doc_arango.py
+```
+
+## πŸš€Start Microservice with Docker
+
+### Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile .
+```
+
+### Run Docker with CLI
+
+```bash
+docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest
+```
+
+### Run Docker with Docker Compose
+
+```bash
+cd comps/dataprep/arango/langchain
+docker compose -f docker-compose-dataprep-arango.yaml up -d
+```
+
+## Invoke Microservice
+
+Once the document preparation microservice for ArangoDB is started, you can use the command below to invoke the microservice to convert a document to embeddings and save them to the database.
+
+After the request is complete, a graph is created in ArangoDB. The default graph name is `Graph`; you can specify the graph name with `-F "graph_name=${your_graph_name}"` in the curl command.
+
+By default, the microservice will create embeddings for the documents if embedding environment variables are specified. You can specify `-F "create_embeddings=false"` to skip the embedding creation.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  -F "graph_name=${your_graph_name}" \
+  http://localhost:6007/v1/dataprep
+```
+
+You can specify chunk_size and chunk_overlap by the following commands.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  -F "chunk_size=1500" \
+  -F "chunk_overlap=100" \
+  -F "graph_name=${your_graph_name}" \
+  http://localhost:6007/v1/dataprep
+```
+
+We support table extraction from PDF documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start the TGI service (refer to 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+To ensure the quality and comprehensiveness of the extracted entities, we recommend using `gpt-4o` as the default model for parsing the document. To enable the OpenAI service, please `export OPENAI_API_KEY=xxxx` before using this service.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./your_file.pdf" \
+  -F "process_table=true" \
+  -F "table_strategy=hq" \
+  -F "graph_name=${your_graph_name}" \
+  http://localhost:6007/v1/dataprep
+```
+
+---
+
+Additional options that can be specified from the environment variables are as follows (default values are in the config.py file):
+
+ArangoDB Configuration:
+- `ARANGO_URL`: The URL for the ArangoDB service.
+- `ARANGO_USERNAME`: The username for the ArangoDB service.
+- `ARANGO_PASSWORD`: The password for the ArangoDB service.
+- `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service.
+- `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection per node type. Defaults to `True`.
+- `INSERT_ASYNC`: If set to True, the microservice will insert the data into ArangoDB asynchronously. Defaults to `False`.
+- `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. Defaults to `500`.
+
+Text Generation Inference Configuration:
+- `TGI_LLM_ENDPOINT`: The endpoint for the TGI service.
+- `TGI_LLM_MAX_NEW_TOKENS`: The maximum number of new tokens to generate. Defaults to `512`.
+- `TGI_LLM_TOP_K`: The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to `40`.
+- `TGI_LLM_TOP_P`: If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to `0.9`.
+- `TGI_LLM_TEMPERATURE`: The temperature for the sampling. Defaults to `0.8`.
+- `TGI_LLM_TIMEOUT`: The timeout for the TGI service. Defaults to `600`.
+
+Text Embeddings Inference Configuration:
+**Note**: This is optional functionality to generate embeddings for text chunks.
+- `TEI_EMBEDDING_ENDPOINT`: The endpoint for the TEI service.
+- `HUGGINGFACEHUB_API_TOKEN`: The API token for the Hugging Face Hub.
+- `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`.
+
+OpenAI Configuration:
+**Note**: This configuration can replace the TGI and TEI services for text generation and embeddings.
+- `OPENAI_API_KEY`: The API key for the OpenAI service.
+- `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. Defaults to `text-embedding-3-small`.
+- `OPENAI_EMBED_DIMENSIONS`: The embedding dimension for the OpenAI service. Defaults to `512`. +- `OPENAI_CHAT_MODEL`: The chat model to use for the OpenAI service. Defaults to `gpt-4o`. +- `OPENAI_CHAT_TEMPERATURE`: The temperature for the OpenAI service. Defaults to `0`. + + +[LangChain LLMGraphTransformer](https://api.python.langchain.com/en/latest/graph_transformers/langchain_experimental.graph_transformers.llm.LLMGraphTransformer.html) Configuration: +- `SYSTEM_PROMPT_PATH`: The path to the system prompt text file. This can be used to specify the specific system prompt for the entity extraction and graph generation steps. +- `ALLOWED_NODES`: Specifies which node types are allowed in the graph. Defaults to an empty list, allowing all node types. +- `ALLOWED_RELATIONSHIPS`: Specifies which relationship types are allowed in the graph. Defaults to an empty list, allowing all relationship types. +- `NODE_PROPERTIES`: If True, the LLM can extract any node properties from text. Alternatively, a list of valid properties can be provided for the LLM to extract, restricting extraction to those specified. Defaults to `["description"]`. +- `RELATIONSHIP_PROPERTIES`: If True, the LLM can extract any relationship properties from text. Alternatively, a list of valid properties can be provided for the LLM to extract, restricting extraction to those specified. Defaults to `["description"]`. diff --git a/comps/dataprep/arango/langchain/__init__.py b/comps/dataprep/arango/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/arango/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py new file mode 100644 index 000000000..1f2312e59 --- /dev/null +++ b/comps/dataprep/arango/langchain/config.py @@ -0,0 +1,42 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") + +# ArangoDB graph configuration +USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True) +INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) +ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) + +# Text Generation Inference configuration +TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_MAX_NEW_TOKENS = os.getenv("TGI_LLM_MAX_NEW_TOKENS", 512) +TGI_LLM_TOP_K = os.getenv("TGI_LLM_TOP_K", 40) +TGI_LLM_TOP_P = os.getenv("TGI_LLM_TOP_P", 0.9) +TGI_LLM_TEMPERATURE = os.getenv("TGI_LLM_TEMPERATURE", 0.8) +TGI_LLM_TIMEOUT = os.getenv("TGI_LLM_TIMEOUT", 600) + +# Text Embeddings Inference configuration +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT") +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") +TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# OpenAI configuration (alternative to TGI & TEI) +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") +OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512) +OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") +OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) + +# LLMGraphTransformer 
configuration +SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH") +ALLOWED_NODES = os.getenv("ALLOWED_NODES", []) +ALLOWED_RELATIONSHIPS = os.getenv("ALLOWED_RELATIONSHIPS", []) +NODE_PROPERTIES = os.getenv("NODE_PROPERTIES", ["description"]) +RELATIONSHIP_PROPERTIES = os.getenv("RELATIONSHIP_PROPERTIES", ["description"]) diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml new file mode 100644 index 000000000..d3a9882c6 --- /dev/null +++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango-vector-db: + image: arangodb/arangodb:latest + container_name: arango-graph-db + ports: + - "8529:8529" + environment: + ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD} + tgi_gaudi_service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + container_name: tgi-service + ports: + - "8088:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HF_TOKEN} + command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 + dataprep-arango: + image: opea/dataprep-arango:latest + container_name: dataprep-arango-server + depends_on: + - arango-vector-db + - tgi_gaudi_service + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ARANGO_URL: ${ARANGO_URL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TEI_EMBED_MODEL: ${TEI_EMBED_MODEL} + OPENAI_API_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py new file mode 100644 index 000000000..d2d467cff --- /dev/null +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -0,0 +1,342 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from typing import List, Optional, Union + +import openai +from arango import ArangoClient +from config import ( + ALLOWED_NODES, + ALLOWED_RELATIONSHIPS, + ARANGO_BATCH_SIZE, + ARANGO_DB_NAME, + ARANGO_PASSWORD, + ARANGO_URL, + ARANGO_USERNAME, + HUGGINGFACEHUB_API_TOKEN, + INSERT_ASYNC, + NODE_PROPERTIES, + OPENAI_API_KEY, + OPENAI_EMBED_DIMENSIONS, + OPENAI_EMBED_MODEL, + RELATIONSHIP_PROPERTIES, + SYSTEM_PROMPT_PATH, + TEI_EMBED_MODEL, + TEI_EMBEDDING_ENDPOINT, + TGI_LLM_ENDPOINT, + TGI_LLM_MAX_NEW_TOKENS, + TGI_LLM_TEMPERATURE, + TGI_LLM_TIMEOUT, + TGI_LLM_TOP_K, + TGI_LLM_TOP_P, + USE_ONE_ENTITY_COLLECTION, +) +from fastapi import File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_core.documents import Document +from langchain_core.prompts import ChatPromptTemplate +from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_openai 
import ChatOpenAI, OpenAIEmbeddings +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) + +logger = CustomLogger("prepare_doc_arango") +logflag = os.getenv("LOGFLAG", True) + +upload_folder = "./uploaded_files/" + +PROMPT_TEMPLATE = None +if SYSTEM_PROMPT_PATH is not None: + try: + with open(SYSTEM_PROMPT_PATH, "r") as f: + PROMPT_TEMPLATE = ChatPromptTemplate.from_messages( + [ + ( + "system", + f.read(), + ), + ( + "human", + ( + "Tip: Make sure to answer in the correct format and do " + "not include any explanations. " + "Use the given format to extract information from the " + "following input: {input}" + ), + ), + ] + ) + except Exception as e: + logger.error(f"Could not set custom Prompt: {e}") + + +def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: bool) -> bool: + """Ingest document to ArangoDB.""" + path = doc_path.path + if logflag: + logger.info(f"Parsing document {path}.") + + ############################# + # Text Generation Inference # + ############################# + + if OPENAI_API_KEY: + if logflag: + logger.info("OpenAI API Key is set. Verifying its validity...") + openai.api_key = OPENAI_API_KEY + + try: + openai.models.list() + if logflag: + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=0, model_name="gpt-4o") + except openai.error.AuthenticationError: + if logflag: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + if logflag: + logger.info(f"An error occurred while verifying the API Key: {e}") + + elif TGI_LLM_ENDPOINT: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=TGI_LLM_MAX_NEW_TOKENS, + top_k=TGI_LLM_TOP_K, + top_p=TGI_LLM_TOP_P, + temperature=TGI_LLM_TEMPERATURE, + timeout=TGI_LLM_TIMEOUT, + ) + else: + raise ValueError("No text generation inference endpoint is set.") + + try: + llm_transformer = LLMGraphTransformer( + llm=llm, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + prompt=PROMPT_TEMPLATE, + node_properties=NODE_PROPERTIES if NODE_PROPERTIES else False, + relationship_properties=RELATIONSHIP_PROPERTIES if RELATIONSHIP_PROPERTIES else False, + ) + except (TypeError, ValueError) as e: + if logflag: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + # Fall back to basic config + try: + llm_transformer = LLMGraphTransformer(llm=llm) + except (TypeError, ValueError) as e: + if logflag: + logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + raise + + ######################################## + # Text Embeddings Inference (optional) # + ######################################## + + embeddings = None + if create_embeddings: + if OPENAI_API_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model=OPENAI_EMBED_MODEL, + dimensions=OPENAI_EMBED_DIMENSIONS, + ) + + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + if logflag: + logger.warning("No embeddings environment variables are set, cannot generate embeddings.") + embeddings = None + + 
############ + # ArangoDB # + ############ + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) + + ############ + # Chunking # + ############ + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + if isinstance(table_chunks, list): + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + + ################################ + # Graph generation & insertion # + ################################ + + generate_chunk_embeddings = embeddings is not None + + for text in chunks: + document = Document(page_content=text) + graph_doc = llm_transformer.process_response(document) + + if generate_chunk_embeddings: + source = graph_doc.source + source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0] + + graph.add_graph_documents( + graph_documents=[graph_doc], + include_source=True, + graph_name=graph_name, + update_graph_definition_if_exists=not USE_ONE_ENTITY_COLLECTION, + batch_size=ARANGO_BATCH_SIZE, + use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, + insert_async=INSERT_ASYNC, + ) + + if logflag: + logger.info("The graph is built.") + + return True + + +@register_microservice( + name="opea_service@prepare_doc_arango", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + graph_name: str = Form("Graph"), + create_embeddings: bool = Form(True), +): + if logflag: + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + graph_name=graph_name, + create_embeddings=create_embeddings, + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"Successfully saved file {save_path}") + result = {"status": 200, "message": "Data preparation succeeded"} + 
if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + graph_name=graph_name, + create_embeddings=create_embeddings, + ) + except json.JSONDecodeError: + raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + if logflag: + logger.info(f"Successfully saved link {link}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/dataprep/arango/langchain/requirements.txt b/comps/dataprep/arango/langchain/requirements.txt new file mode 100644 index 000000000..74d4a9f0d --- /dev/null +++ b/comps/dataprep/arango/langchain/requirements.txt @@ -0,0 +1,32 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +langchain-experimental +langchain-openai +langchain-text-splitters +langchain_huggingface +markdown +python-arango +cityhash +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pytesseract +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.15.7 +uvicorn From 32445331c07b063a61db1ff35aa5a8be923c5bc4 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 30 Dec 2024 14:18:29 -0500 Subject: [PATCH 17/22] new: `source_metadata_fields_to_extract_to_top_level` --- comps/dataprep/arango/langchain/prepare_doc_arango.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index d2d467cff..77e73c00c 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -236,7 +236,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: if generate_chunk_embeddings: source = graph_doc.source - source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0] + source.metadata["embedding"] = embeddings.embed_documents([source.page_content])[0] graph.add_graph_documents( graph_documents=[graph_doc], @@ -246,6 +246,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: batch_size=ARANGO_BATCH_SIZE, use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, insert_async=INSERT_ASYNC, + source_metadata_fields_to_extract_to_top_level={"embedding"}, ) if logflag: From 3604fb83c0f027cad2b883e5ab7250a6b3f69db9 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 30 Dec 2024 17:59:31 -0500 Subject: [PATCH 18/22] fix: logger info --- comps/dataprep/arango/langchain/prepare_doc_arango.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 77e73c00c..88b3b5986 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -222,7 +222,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: if isinstance(table_chunks, list): chunks = chunks + table_chunks if logflag: - logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.") ################################ # Graph generation & insertion # From 50b26397eafd836d0cce24418d3c60c6472fad3d Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Sun, 5 Jan 2025 17:25:53 -0500 Subject: [PATCH 19/22] ArangoDB: Retriever (#2) * wip: retriever * rename: `arango` * checkpoint * cleanup * fix: env * update retriever compose * add test file * fix: config & dockerfile * fix: embedding field name * new: config variables * new: traverse graph after similarity * fix: string * add `uniqueVertices` * add filter * infra * fix: query * remove: `similarity_distance_threshold` * temp: replace `p` * cleanup * remove: `ARANGO_TRAVERSAL_MIN_DEPTH` * update max_depth * new: `fetch_neighborhoods` * fix: test * cleanup: `prepare_doc_arango.py` * move `graph` & `vector_db` instantiation * cleanup: dataprep readme * cleanup: retriever * fix: arango test scripts * Update test_retrievers_arango_langchain.sh * update `ARANGO_EMBEDDING_DIMENSION` * fix: env vars * cleanup: retriever port * new: `test_dataprep_arango_langchain` * new: retriever yaml * Changing naming convention from arangodb to arango to ensure consistency between microservices, updated dockerfile to match and removed space in port * fix: retriever name * remove: `retriever_arangodb` --------- Co-authored-by: Ajay Kallepalli --- .../docker/compose/retrievers-compose.yaml | 4 + comps/dataprep/arango/langchain/README.md | 42 +-- comps/dataprep/arango/langchain/config.py | 4 +- .../arango/langchain/prepare_doc_arango.py | 205 +++++++------- comps/retrievers/README.md | 4 + comps/retrievers/arango/__init__.py | 0 comps/retrievers/arango/langchain/Dockerfile | 34 +++ comps/retrievers/arango/langchain/README.md | 144 ++++++++++ comps/retrievers/arango/langchain/__init__.py | 2 + comps/retrievers/arango/langchain/config.py | 32 +++ .../docker-compose-retriever-arango.yaml | 54 ++++ .../arango/langchain/requirements.txt | 22 ++ .../arango/langchain/retriever_arango.py | 249 ++++++++++++++++++ tests/chathistory/test_chathistory_arango.sh | 2 +- .../test_dataprep_arango_langchain.sh | 105 ++++++++ .../test_feedback_management_arango.sh | 2 +- .../test_prompt_registry_arango.sh | 2 +- .../test_retrievers_arango_langchain.sh | 126 +++++++++ 18 files changed, 906 insertions(+), 127 deletions(-) create mode 100644 comps/retrievers/arango/__init__.py create mode 100644 comps/retrievers/arango/langchain/Dockerfile create mode 100644 comps/retrievers/arango/langchain/README.md create mode 100644 comps/retrievers/arango/langchain/__init__.py create mode 100644 comps/retrievers/arango/langchain/config.py create mode 100644 comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml create mode 100644 comps/retrievers/arango/langchain/requirements.txt create mode 100644 comps/retrievers/arango/langchain/retriever_arango.py create mode 100644 
tests/dataprep/test_dataprep_arango_langchain.sh create mode 100644 tests/retrievers/test_retrievers_arango_langchain.sh diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index 7b89ce9bf..bfc8a29a5 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -47,3 +47,7 @@ services: build: dockerfile: comps/retrievers/neo4j/llama_index/Dockerfile image: ${REGISTRY:-opea}/retriever-neo4j-llamaindex:${TAG:-latest} + retriever-arango: + build: + dockerfile: comps/retrievers/arango/langchain/Dockerfile + image: ${REGISTRY:-opea}/retriever-arango:${TAG:-latest} \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index 37f0a078b..fb383b42e 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -1,6 +1,6 @@ # Dataprep Microservice with ArangoDB -## πŸš€Start Microservice with Python +## πŸš€ 1. Start Microservice with Python ### Install Requirements @@ -31,27 +31,27 @@ export ARANGO_DB_NAME=${your_db_name} export PYTHONPATH=${path_to_comps} ``` -### Start Document Preparation Microservice for ArangoDB with Python Script +See below for additional environment variables that can be set. -Start document preparation microservice for ArangoDB with below command. +### Start Dataprep Service ```bash python prepare_doc_arango.py ``` -## πŸš€Start Microservice with Docker +## πŸš€ 2. Start Microservice with Docker ### Build Docker Image ```bash -cd ../../../../ +cd /your/path/to/GenAIComps docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile . ``` ### Run Docker with CLI ```bash -docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest +docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ... opea/dataprep-arango:latest ``` ### Run Docker with Docker Compose @@ -61,13 +61,9 @@ cd comps/dataprep/arango/langchain docker compose -f docker-compose-dataprep-arango.yaml up -d ``` -## Invoke Microservice +## πŸš€ 3. Consume Retriever Service -Once document preparation microservice for ArangoDB is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. - -After the service is complete a Graph is created in ArangoDB. The default graph name is `Graph`, you can specify the graph name by `-F "graph_name=${your_graph_name}"` in the curl command. - -By default, the microservice will create embeddings for the documents if embedding environment variables are specified. You can specify `-F "create_embeddings=false"` to skip the embedding creation. +An ArangoDB Graph is created from the documents provided to the microservice. The microservice will extract entities from the documents and create nodes and relationships in the graph based on the entities extracted. The microservice will also create embeddings for the documents if embedding environment variables are specified. ```bash curl -X POST \ @@ -77,7 +73,11 @@ curl -X POST \ http://localhost:6007/v1/dataprep ``` -You can specify chunk_size and chunk_size by the following commands. 
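+After ingestion completes, you can optionally sanity-check the result directly in ArangoDB, either through the web UI at `${ARANGO_URL}` or via the HTTP API. The snippet below is only a sketch: it assumes the default graph name `Graph` (whose ingested chunks land in a `Graph_SOURCE` collection) and the connection variables exported above.
+
+```bash
+# Hypothetical sanity check: count the ingested source chunks for the default graph "Graph"
+curl -u ${ARANGO_USERNAME}:${ARANGO_PASSWORD} \
+  "${ARANGO_URL}/_db/${ARANGO_DB_NAME}/_api/collection/Graph_SOURCE/count"
+```
+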
+You can specify the graph name with `-F "graph_name=${your_graph_name}"` in the curl command. + +By default, the microservice will create embeddings for the documents if embedding environment variables are specified. You can specify `-F "create_embeddings=false"` to skip document embedding creation. + +You can also specify the `chunk_size` and `chunk_overlap` with the following parameters: ```bash curl -X POST \ @@ -89,11 +89,11 @@ curl -X POST \ http://localhost:6007/v1/dataprep ``` -We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". - -Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. +We support table extraction from pdf documents. You can specify `process_table` and `table_strategy` with the following parameters: +- `table_strategy` refers to the strategies to understand tables for table retrieval. As the setting progresses from `"fast"` to `"hq"` to `"llm"`, the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is `"fast"`. +- `process_table` refers to whether to process tables in the document. The default value is `False`. -For ensure the quality and comprehensiveness of the extracted entities, we recommend to use `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_API_KEY=xxxx` before using this services. +Note: If you specify `"table_strategy=llm"`, you should first start the TGI Service. Please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. ```bash curl -X POST \ @@ -107,13 +107,15 @@ curl -X POST \ --- -Additional options that can be specified from the environment variables are as follows (default values are in the config.py file): +Additional options that can be specified from the environment variables are as follows (default values are also in the `config.py` file): -ArangoDB Configuration: +ArangoDB Connection configuration - `ARANGO_URL`: The URL for the ArangoDB service. - `ARANGO_USERNAME`: The username for the ArangoDB service. - `ARANGO_PASSWORD`: The password for the ArangoDB service. - `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. + +ArangoDB Graph Insertion configuration - `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection by node type. Defaults to `True`. - `INSERT_ASYNC`: If set to True, the microservice will insert the data into ArangoDB asynchronously. Defaults to `False`. - `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. Defaults to `500`. @@ -127,7 +129,7 @@ Text Generation Inference Configuration - `TGI_LLM_TIMEOUT`: The timeout for the TGI service. Defaults to `600`. Text Embeddings Inferencing Configuration -**Note**: This is optional functionality to generate embeddings for text chunks. 
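+The embedding backend is selected in this order: OpenAI (if `OPENAI_API_KEY` is set), then a TEI endpoint (if both `TEI_EMBEDDING_ENDPOINT` and `HUGGINGFACEHUB_API_TOKEN` are set), and finally a local `TEI_EMBED_MODEL`. The relevant variables are listed below; as a sketch, pointing the service at a TEI endpoint could look like this (host, port, and token are placeholders for your own deployment):
+
+```bash
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6006"
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
+export TEI_EMBED_MODEL="BAAI/bge-base-en-v1.5"
+```
+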
+**Note**: This is optional functionality to generate embeddings for documents (i.e text chunks). - `TEI_EMBEDDING_ENDPOINT`: The endpoint for the TEI service. - `HUGGINGFACEHUB_API_TOKEN`: The API token for the Hugging Face Hub. - `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`. diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 1f2312e59..c3caf2da2 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -3,13 +3,13 @@ import os -# ArangoDB configuration +# ArangoDB Connection configuration ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") -# ArangoDB graph configuration +# ArangoDB Graph Insertion configuration USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True) INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 88b3b5986..a89984f08 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -19,6 +19,8 @@ INSERT_ASYNC, NODE_PROPERTIES, OPENAI_API_KEY, + OPENAI_CHAT_MODEL, + OPENAI_CHAT_TEMPERATURE, OPENAI_EMBED_DIMENSIONS, OPENAI_EMBED_MODEL, RELATIONSHIP_PROPERTIES, @@ -84,110 +86,13 @@ logger.error(f"Could not set custom Prompt: {e}") -def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: bool) -> bool: +def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_embeddings: bool) -> bool: """Ingest document to ArangoDB.""" path = doc_path.path + if logflag: logger.info(f"Parsing document {path}.") - ############################# - # Text Generation Inference # - ############################# - - if OPENAI_API_KEY: - if logflag: - logger.info("OpenAI API Key is set. 
Verifying its validity...") - openai.api_key = OPENAI_API_KEY - - try: - openai.models.list() - if logflag: - logger.info("OpenAI API Key is valid.") - llm = ChatOpenAI(temperature=0, model_name="gpt-4o") - except openai.error.AuthenticationError: - if logflag: - logger.info("OpenAI API Key is invalid.") - except Exception as e: - if logflag: - logger.info(f"An error occurred while verifying the API Key: {e}") - - elif TGI_LLM_ENDPOINT: - llm = HuggingFaceEndpoint( - endpoint_url=TGI_LLM_ENDPOINT, - max_new_tokens=TGI_LLM_MAX_NEW_TOKENS, - top_k=TGI_LLM_TOP_K, - top_p=TGI_LLM_TOP_P, - temperature=TGI_LLM_TEMPERATURE, - timeout=TGI_LLM_TIMEOUT, - ) - else: - raise ValueError("No text generation inference endpoint is set.") - - try: - llm_transformer = LLMGraphTransformer( - llm=llm, - allowed_nodes=ALLOWED_NODES, - allowed_relationships=ALLOWED_RELATIONSHIPS, - prompt=PROMPT_TEMPLATE, - node_properties=NODE_PROPERTIES if NODE_PROPERTIES else False, - relationship_properties=RELATIONSHIP_PROPERTIES if RELATIONSHIP_PROPERTIES else False, - ) - except (TypeError, ValueError) as e: - if logflag: - logger.warning(f"Advanced LLMGraphTransformer failed: {e}") - # Fall back to basic config - try: - llm_transformer = LLMGraphTransformer(llm=llm) - except (TypeError, ValueError) as e: - if logflag: - logger.error(f"Failed to initialize LLMGraphTransformer: {e}") - raise - - ######################################## - # Text Embeddings Inference (optional) # - ######################################## - - embeddings = None - if create_embeddings: - if OPENAI_API_KEY: - # Use OpenAI embeddings - embeddings = OpenAIEmbeddings( - model=OPENAI_EMBED_MODEL, - dimensions=OPENAI_EMBED_DIMENSIONS, - ) - - elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: - # Use TEI endpoint service - embeddings = HuggingFaceHubEmbeddings( - model=TEI_EMBEDDING_ENDPOINT, - huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, - ) - elif TEI_EMBED_MODEL: - # Use local embedding model - embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) - else: - if logflag: - logger.warning("No embeddings environment variables are set, cannot generate embeddings.") - embeddings = None - - ############ - # ArangoDB # - ############ - - client = ArangoClient(hosts=ARANGO_URL) - sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - - if not sys_db.has_database(ARANGO_DB_NAME): - sys_db.create_database(ARANGO_DB_NAME) - - db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - - graph = ArangoGraph( - db=db, - include_examples=False, - generate_schema_on_init=False, - ) - ############ # Chunking # ############ @@ -221,6 +126,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: table_chunks = get_tables_result(path, doc_path.table_strategy) if isinstance(table_chunks, list): chunks = chunks + table_chunks + if logflag: logger.info(f"Done preprocessing. 
Created {len(chunks)} chunks of the original file.") @@ -228,7 +134,11 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: # Graph generation & insertion # ################################ - generate_chunk_embeddings = embeddings is not None + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) for text in chunks: document = Document(page_content=text) @@ -294,7 +204,7 @@ async def ingest_documents( table_strategy=table_strategy, ), graph_name=graph_name, - create_embeddings=create_embeddings, + generate_chunk_embeddings=create_embeddings and embeddings is not None, ) uploaded_files.append(save_path) if logflag: @@ -323,7 +233,7 @@ async def ingest_documents( table_strategy=table_strategy, ), graph_name=graph_name, - create_embeddings=create_embeddings, + generate_chunk_embeddings=create_embeddings and embeddings is not None, ) except json.JSONDecodeError: raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") @@ -340,4 +250,95 @@ async def ingest_documents( if __name__ == "__main__": + + ############################# + # Text Generation Inference # + ############################# + + if OPENAI_API_KEY: + if logflag: + logger.info("OpenAI API Key is set. Verifying its validity...") + openai.api_key = OPENAI_API_KEY + + try: + openai.models.list() + if logflag: + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=OPENAI_CHAT_TEMPERATURE, model_name=OPENAI_CHAT_MODEL) + except openai.error.AuthenticationError: + if logflag: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + if logflag: + logger.info(f"An error occurred while verifying the API Key: {e}") + + elif TGI_LLM_ENDPOINT: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=TGI_LLM_MAX_NEW_TOKENS, + top_k=TGI_LLM_TOP_K, + top_p=TGI_LLM_TOP_P, + temperature=TGI_LLM_TEMPERATURE, + timeout=TGI_LLM_TIMEOUT, + ) + else: + raise ValueError("No text generation inference endpoint is set.") + + try: + llm_transformer = LLMGraphTransformer( + llm=llm, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + prompt=PROMPT_TEMPLATE, + node_properties=NODE_PROPERTIES or False, + relationship_properties=RELATIONSHIP_PROPERTIES or False, + ) + except (TypeError, ValueError) as e: + if logflag: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + # Fall back to basic config + try: + llm_transformer = LLMGraphTransformer(llm=llm) + except (TypeError, ValueError) as e: + if logflag: + logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + raise + + ######################################## + # Text Embeddings Inference (optional) # + ######################################## + + if OPENAI_API_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model=OPENAI_EMBED_MODEL, + dimensions=OPENAI_EMBED_DIMENSIONS, + ) + + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + if logflag: + logger.warning("No embeddings environment variables are set, cannot generate embeddings.") + embeddings = None + + ############ + # ArangoDB # + ############ + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, 
password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/retrievers/README.md b/comps/retrievers/README.md index eeba8860e..6ae15db0e 100644 --- a/comps/retrievers/README.md +++ b/comps/retrievers/README.md @@ -33,3 +33,7 @@ For details, please refer to this [readme](vdms/langchain/README.md) ## Retriever Microservice with Multimodal For details, please refer to this [readme](multimodal/redis/langchain/README.md) + +## Retriever Microservice with ArangoDB + +For details, please refer to this [readme](arango/langchain/README.md) diff --git a/comps/retrievers/arango/__init__.py b/comps/retrievers/arango/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/comps/retrievers/arango/langchain/Dockerfile b/comps/retrievers/arango/langchain/Dockerfile new file mode 100644 index 000000000..27a04ccb9 --- /dev/null +++ b/comps/retrievers/arango/langchain/Dockerfile @@ -0,0 +1,34 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ENV HUGGINGFACEHUB_API_TOKEN=dummy + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/arango/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/arango/langchain + +ENTRYPOINT ["python", "retriever_arango.py"] diff --git a/comps/retrievers/arango/langchain/README.md b/comps/retrievers/arango/langchain/README.md new file mode 100644 index 000000000..66b6e74bc --- /dev/null +++ b/comps/retrievers/arango/langchain/README.md @@ -0,0 +1,144 @@ +# Retriever Microservice with ArangoDB + +## πŸš€ 1. Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start ArangoDB Server + +To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. + +```bash +docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export ARANGO_URL=${your_arango_url} +export ARANGO_USERNAME=${your_arango_username} +export ARANGO_PASSWORD=${your_arango_password} +export ARANGO_DB_NAME=${your_db_name} +export ARANGO_COLLECTION_NAME=${your_collection_name} +export ARANGO_EMBEDDING_DIMENSION=${your_embedding_dimension} +export PYTHONPATH=${path_to_comps} +``` + +See below for additional environment variables that can be set. + +### Start Retriever Service + +```bash +python retriever_arango.py +``` + +## πŸš€ 2. 
Start Microservice with Docker + +### Build Docker Image + +```bash +cd /your/path/to/GenAIComps +docker build -t opea/retriever-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/arangodb/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="retriever-arango-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ... opea/retriever-arango:latest +``` + +### Run Docker with Docker Compose + +```bash +cd /your/path/to/GenAIComps/comps/retriever/arango/langchain +docker compose -f docker-compose-retriever-arango.yaml up -d +``` + +## πŸš€ 3. Consume Retriever Service + +### 3.1 Check Service Status + +```bash +curl http://${your_ip}:7000/v1/health_check \ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 3.2 Consume Embedding Service + +Assuming you have an ArangoDB Collection with the documents you want to retrieve from, you can consume the retriever service with the following curl command. + +```bash +curl http://${your_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[]}" \ + -H 'Content-Type: application/json' +``` + +If `embedding` is not specified or is an empty list, the retriever will use the text to generate an embedding using the Embedding Environment variables provided. + +Additional parameters can be set for the retriever: + +```bash +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[],\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[],\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[],\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` + +--- + +Additional options that can be specified from the environment variables are as follows (default values are also in the `config.py` file): + +ArangoDB Connection configuration +- `ARANGO_URL`: The URL for the ArangoDB service. +- `ARANGO_USERNAME`: The username for the ArangoDB service. +- `ARANGO_PASSWORD`: The password for the ArangoDB service. +- `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. + +ArangoDB Collection configuration +- `ARANGO_COLLECTION_NAME`: The name of the collection containing the documents. +- `ARANGO_DISTANCE_STRATEGY`: The distance strategy to use for the embeddings. Options are `COSINE` and `L2` (euclidean distance). +- `ARANGO_USE_APPROX_SEARCH`: Whether to use approximate neighbor search. If False, exact search will be used (slower, but more accurate). If True, approximate search will be used (faster, but less accurate). Defaults to `True`. +- `ARANGO_TEXT_FIELD`: The document field name storing the text. +- `ARANGO_EMBEDDING_FIELD`: The document field name storing the embeddings. +- `ARANGO_EMBEDDING_DIMENSION`: The dimension of the document embeddings. +- `ARANGO_NUM_CENTROIDS`: The number of centroids to use for the approximate search. 
Defaults to `1`, which is essentially exhaustive search. + +ArangoDB Traversal configuration +- `ARANGO_TRAVERSAL_GRAPH_NAME`: If specified, the retriever will traverse the graph to retrieve the neighborhood of the retrieved documents. +- `ARANGO_TRAVERSAL_MAX_DEPTH`: The maximum depth to traverse the graph. Defaults to `1`. + +Embedding Configuration +- `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`. +- `TEI_EMBEDDING_ENDPOINT`: The endpoint for the TEI service. +- `HUGGINGFACEHUB_API_TOKEN`: The API token for the Hugging Face Hub. + +OpenAI Configuration: +**Note**: This configuration can replace the TGI and TEI services for text generation and embeddings. +- `OPENAI_API_KEY`: The API key for the OpenAI service. +- `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. Defaults to `text-embedding-3-small`. \ No newline at end of file diff --git a/comps/retrievers/arango/langchain/__init__.py b/comps/retrievers/arango/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/retrievers/arango/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py new file mode 100644 index 000000000..d1df90cfd --- /dev/null +++ b/comps/retrievers/arango/langchain/config.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ArangoDB Connection configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") + +# ArangoDB Vector configuration +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Graph_SOURCE") +ARANGO_DISTANCE_STRATEGY = os.getenv("ARANGO_DISTANCE_STRATEGY", "COSINE") +ARANGO_USE_APPROX_SEARCH = os.getenv("ARANGO_USE_APPROX_SEARCH", True) +ARANGO_TEXT_FIELD = os.getenv("ARANGO_TEXT_FIELD", "text") +ARANGO_EMBEDDING_FIELD = os.getenv("ARANGO_EMBEDDING_FIELD", "embedding") +ARANGO_EMBEDDING_DIMENSION = os.getenv("ARANGO_EMBEDDING_DIMENSION") +ARANGO_NUM_CENTROIDS = os.getenv("ARANGO_NUM_CENTROIDS", 1) + +# ArangoDB Traversal configuration +ARANGO_TRAVERSAL_GRAPH_NAME = os.getenv("ARANGO_TRAVERSAL_GRAPH_NAME") +ARANGO_TRAVERSAL_MAX_DEPTH = os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1) + +# Embedding configuration +TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") + +# OpenAI configuration (alternative to TEI & local model) +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") diff --git a/comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml b/comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml new file mode 100644 index 000000000..ffa86d28f --- /dev/null +++ b/comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml @@ -0,0 +1,54 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango-vector-db: + image: jbajic/arango-preview:vector-index-preview-5 + container_name: arango-vector + ports: + - "8529:8529" + environment: + ARANGO_ROOT_PASSWORD: 
${ARANGO_PASSWORD} + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + retriever-arango: + image: opea/retriever-arango:latest + container_name: retriever-arango-server + depends_on: + - arango-vector-db + - tei-embedding-service + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ARANGO_URL: ${ARANGO_URL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + ARANGO_COLLECTION_NAME: ${ARANGO_COLLECTION_NAME} + ARANGO_EMBEDDING_DIMENSION: ${ARANGO_EMBEDDING_DIMENSION} + TEI_EMBED_MODEL: ${TEI_EMBED_MODEL} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + OPENAI_API_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/retrievers/arango/langchain/requirements.txt b/comps/retrievers/arango/langchain/requirements.txt new file mode 100644 index 000000000..f1d40dbbe --- /dev/null +++ b/comps/retrievers/arango/langchain/requirements.txt @@ -0,0 +1,22 @@ +docarray[full] +fastapi +frontend +huggingface_hub +langchain +git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +langchain_openai +python-arango +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pydantic +pymupdf +python-docx +sentence_transformers +shortuuid +tiktoken +uvicorn diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py new file mode 100644 index 000000000..2a7b37412 --- /dev/null +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -0,0 +1,249 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Any, Optional, Union + +from arango import ArangoClient +from config import ( + ARANGO_COLLECTION_NAME, + ARANGO_DB_NAME, + ARANGO_DISTANCE_STRATEGY, + ARANGO_EMBEDDING_DIMENSION, + ARANGO_EMBEDDING_FIELD, + ARANGO_NUM_CENTROIDS, + ARANGO_PASSWORD, + ARANGO_TEXT_FIELD, + ARANGO_TRAVERSAL_GRAPH_NAME, + ARANGO_TRAVERSAL_MAX_DEPTH, + ARANGO_URL, + ARANGO_USE_APPROX_SEARCH, + ARANGO_USERNAME, + TEI_EMBEDDING_ENDPOINT, + TEI_EMBED_MODEL, + HUGGINGFACEHUB_API_TOKEN, + OPENAI_API_KEY, + OPENAI_EMBED_MODEL, +) +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_openai import OpenAIEmbeddings + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RetrievalRequest, + RetrievalResponse, + RetrievalResponseData, +) + + +class ArangoTextDoc(TextDoc): + neighborhood: Optional[list[dict[str, Any]]] = None + + +class ArangoRetrievalResponseData(RetrievalResponseData): + neighborhood: Optional[list[dict[str, Any]]] = None + + +logger = 
CustomLogger("retriever_arango") +logflag = os.getenv("LOGFLAG", False) + + +def fetch_neighborhoods( + vector_db: ArangoVector, + keys: list[str], + neighborhoods: dict[str, Any], + graph_name: str, + source_collection_name: str, + max_depth: int, +) -> None: + """Fetch neighborhoods of source documents. Updates the neighborhoods dictionary in-place.""" + if not vector_db.db.has_graph(graph_name): + logger.error("Graph not found in database.") + return + + graph = vector_db.db.graph(graph_name) + + if not graph.has_edge_collection(f"{graph_name}_HAS_SOURCE"): + logger.error(f"Edge collection '{graph_name}_HAS_SOURCE' not found in graph.") + return + + if not graph.has_edge_collection(f"{graph_name}_LINKS_TO"): + logger.error(f"Edge collection '{graph_name}_LINKS_TO' not found in graph.") + return + + if max_depth < 1: + max_depth = 1 + + # TODO: Consider using general `GRAPH` syntax instead of specific edge collections... + aql = f""" + FOR doc IN @@collection + FILTER doc._key IN @keys + + LET entity_neighborhood = ( + FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE + FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO + RETURN p2 + ) + + RETURN {{[doc._key]: entity_neighborhood}} + """ + + bind_vars = { + "@collection": source_collection_name, + "keys": keys, + } + + cursor = vector_db.db.aql.execute(aql, bind_vars=bind_vars) + + for doc in cursor: + neighborhoods.update(doc) + + +@register_microservice( + name="opea_service@retriever_arango", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_arango"]) +async def retrieve( + input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: + if logflag: + logger.info(input) + + start = time.time() + + query = input.text if isinstance(input, EmbedDoc) else input.input + embedding = input.embedding if isinstance(input.embedding, list) else None + + vector_db = ArangoVector( + embedding=embeddings, + embedding_dimension=ARANGO_EMBEDDING_DIMENSION, + database=db, + collection_name=ARANGO_COLLECTION_NAME, + embedding_field=ARANGO_EMBEDDING_FIELD, + text_field=ARANGO_TEXT_FIELD, + distance_strategy=ARANGO_DISTANCE_STRATEGY, + num_centroids=ARANGO_NUM_CENTROIDS, + ) + + if input.search_type == "similarity_score_threshold": + docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( + query=query, + embedding=embedding, + k=input.k, + score_threshold=input.score_threshold, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = await vector_db.amax_marginal_relevance_search( + query=query, + embedding=embedding, + k=input.k, + fetch_k=input.fetch_k, + lambda_mult=input.lambda_mult, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + else: + # Default to basic similarity search + search_res = await vector_db.asimilarity_search( + query=query, + embedding=embedding, + k=input.k, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + + neighborhoods = {} + if ARANGO_TRAVERSAL_GRAPH_NAME: + fetch_neighborhoods( + vector_db, + neighborhoods, + [r.id for r in search_res], + ARANGO_TRAVERSAL_GRAPH_NAME, + ARANGO_COLLECTION_NAME, + ARANGO_TRAVERSAL_MAX_DEPTH, + ) + + # return different response format + retrieved_docs: Union[list[ArangoTextDoc], list[ArangoRetrievalResponseData]] = [] + if isinstance(input, EmbedDoc): + for r in search_res: + retrieved_docs.append( + 
ArangoTextDoc( + text=r.page_content, + id=r.id, + neighborhood=neighborhoods.get(r.id), + ) + ) + + result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + + else: + for r in search_res: + retrieved_docs.append( + ArangoRetrievalResponseData( + text=r.page_content, + id=r.id, + metadata=r.metadata, + neighborhood=neighborhoods.get(r.id), + ) + ) + + if isinstance(input, RetrievalRequest): + result = RetrievalResponse(retrieved_docs=retrieved_docs) + + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = retrieved_docs + input.documents = [doc.text for doc in retrieved_docs] + result = input + else: + raise ValueError("Invalid input type: ", type(input)) + + statistics_dict["opea_service@retriever_arango"].append_latency(time.time() - start, None) + + if logflag: + logger.info(result) + + return result + + +if __name__ == "__main__": + + if not ARANGO_EMBEDDING_DIMENSION: + raise ValueError("EMBED_DIMENSION must specified in advance.") + + if OPENAI_API_KEY and OPENAI_EMBED_MODEL: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings(model=OPENAI_EMBED_MODEL, dimensions=ARANGO_EMBEDDING_DIMENSION) + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # create embeddings using TEI endpoint service + embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT, huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + opea_microservices["opea_service@retriever_arango"].start() diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh index f9d731802..9743a9ac4 100644 --- a/tests/chathistory/test_chathistory_arango.sh +++ b/tests/chathistory/test_chathistory_arango.sh @@ -16,7 +16,7 @@ export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH echo $(pwd) - docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest docker build --no-cache -t opea/chathistory-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . if [ $? 
-ne 0 ]; then diff --git a/tests/dataprep/test_dataprep_arango_langchain.sh b/tests/dataprep/test_dataprep_arango_langchain.sh new file mode 100644 index 000000000..45d82b974 --- /dev/null +++ b/tests/dataprep/test_dataprep_arango_langchain.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"_system"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest + sleep 1m + + docker build --no-cache -t opea/dataprep-arango:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/arango/langchain/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/dataprep-arango built fail" + exit 1 + else + echo "opea/dataprep-arango built successful" + fi +} + +function start_service() { + tgi_endpoint=5044 + # Remember to set HF_TOKEN before invoking this test! + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + model=Intel/neural-chat-7b-v3-3 + docker run -d --name="test-comps-dataprep-tgi-endpoint" -p $tgi_endpoint:80 -v ./data:/data --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model + export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint}" + + # unset http_proxy + export no_proxy="localhost,127.0.0.1,"${ip_address} + docker run -d --name="test-comps-dataprep-arango-server" \ + -p 6007:6007 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e ARANGO_URL=$ARANGO_URL \ + -e ARANGO_USERNAME=$ARANGO_USERNAME \ + -e ARANGO_PASSWORD=$ARANGO_PASSWORD \ + -e ARANGO_DB_NAME=$ARANGO_DB_NAME \ + -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT \ + opea/dataprep-arango:comps + + sleep 1m +} + + +function validate_microservice() { + cd $LOG_PATH + + # test /v1/dataprep + URL="http://${ip_address}:6007/v1/dataprep" + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep ] HTTP status is 200. Checking content..." + cp ./dataprep_file.txt ./dataprep_file2.txt + local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file2.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log) + + if echo "$CONTENT" | grep -q "Data preparation succeeded"; then + echo "[ dataprep ] Content is as expected." + else + echo "[ dataprep ] Content does not match the expected result: $CONTENT" + docker logs test-comps-dataprep-arango >> ${LOG_PATH}/dataprep.log + exit 1 + fi + else + echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-arango >> ${LOG_PATH}/dataprep.log + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-arango*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + + cid=$(docker ps -aq --filter "name=test-comps-dataprep-arango*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh index 6bbd32598..2eec9360e 100644 --- a/tests/feedback_management/test_feedback_management_arango.sh +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -16,7 +16,7 @@ export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH echo $(pwd) - docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest docker build --no-cache -t opea/feedbackmanagement-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . if [ $? -ne 0 ]; then diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh index 16d81b17d..2338322a0 100644 --- a/tests/prompt_registry/test_prompt_registry_arango.sh +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -16,7 +16,7 @@ export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH echo $(pwd) - docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest docker build --no-cache -t opea/promptregistry-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . if [ $? 
-ne 0 ]; then diff --git a/tests/retrievers/test_retrievers_arango_langchain.sh b/tests/retrievers/test_retrievers_arango_langchain.sh new file mode 100644 index 000000000..1b0ec6251 --- /dev/null +++ b/tests/retrievers/test_retrievers_arango_langchain.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"_system"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} +export ARANGO_EMBEDDING_DIMENSION=${ARANGO_EMBEDDING_DIMENSION:-5} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest + sleep 1m + + # Create ARANGO_COLLECTION_NAME + curl -X POST --header 'accept: application/json' \ + --header 'Content-Type: application/json' \ + --data '{"name": "'${ARANGO_COLLECTION_NAME}'", "type": 2, "waitForSync": true}' \ + "${ARANGO_URL}/_db/${ARANGO_DB_NAME}/_api/collection" \ + -u ${ARANGO_USERNAME}:${ARANGO_PASSWORD} + + # Insert data into arango: {text: "test", embedding: [0.1, 0.2, 0.3, 0.4, 0.5]} + curl -X POST --header 'accept: application/json' \ + --header 'Content-Type: application/json' \ + --data '{"text": "test", "embedding": [0.1, 0.2, 0.3, 0.4, 0.5]}' \ + "${ARANGO_URL}/_db/${ARANGO_DB_NAME}/_api/document/${ARANGO_COLLECTION_NAME}" \ + -u ${ARANGO_USERNAME}:${ARANGO_PASSWORD} + + docker build --no-cache -t opea/retriever-arango:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/arango/langchain/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/retriever-arango built fail" + exit 1 + else + echo "opea/retriever-arango built successful" + fi +} + +function start_service() { + # tei endpoint + tei_endpoint=5434 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-arango-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # unset http_proxy + export no_proxy="localhost,127.0.0.1,"${ip_address} + docker run -d --name="test-comps-retriever-arango-server" \ + -p 7000:7000 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e ARANGO_URL=$ARANGO_URL \ + -e ARANGO_USERNAME=$ARANGO_USERNAME \ + -e ARANGO_PASSWORD=$ARANGO_PASSWORD \ + -e ARANGO_DB_NAME=$ARANGO_DB_NAME \ + -e ARANGO_COLLECTION_NAME=$ARANGO_COLLECTION_NAME \ + -e ARANGO_EMBEDDING_DIMENSION=$ARANGO_EMBEDDING_DIMENSION \ + -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT \ + opea/retriever-arango:comps + + sleep 1m +} + +function validate_microservice() { + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:7000/v1/retrieval" + + test_embedding="[0.1, 0.2, 0.3, 0.4, 0.5]" + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." 
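+        # The retriever answers with a SearchedDoc-style JSON payload; the check below only
+        # greps for its "retrieved_docs" field rather than validating the full response schema.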
+        local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log)
+
+        if echo "$CONTENT" | grep -q "retrieved_docs"; then
+            echo "[ retriever ] Content is as expected."
+        else
+            echo "[ retriever ] Content does not match the expected result: $CONTENT"
+            docker logs test-comps-retriever-arango-server >> ${LOG_PATH}/retriever.log
+            docker logs test-comps-retriever-arango-tei-endpoint >> ${LOG_PATH}/tei.log
+            exit 1
+        fi
+    else
+        echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-retriever-arango-server >> ${LOG_PATH}/retriever.log
+        docker logs test-comps-retriever-arango-tei-endpoint >> ${LOG_PATH}/tei.log
+        exit 1
+    fi
+}
+
+function stop_docker() {
+    cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever-arango*")
+    if [[ ! -z "$cid_retrievers" ]]; then
+        docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s
+    fi
+    cid_db=$(docker ps -aq --filter "name=test-comps-arango")
+    if [[ ! -z "$cid_db" ]]; then
+        docker stop $cid_db && docker rm $cid_db && sleep 1s
+    fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main

From 9a7d8104c9104fa1cf40e7e7ec90417da8570d83 Mon Sep 17 00:00:00 2001
From: Anthony Mahanna
Date: Sun, 5 Jan 2025 17:27:17 -0500
Subject: [PATCH 20/22] revert: feedback management mongo change

---
 .../workflows/docker/compose/feedback_management-compose.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker/compose/feedback_management-compose.yaml b/.github/workflows/docker/compose/feedback_management-compose.yaml
index 51f5ae343..c22e6e496 100644
--- a/.github/workflows/docker/compose/feedback_management-compose.yaml
+++ b/.github/workflows/docker/compose/feedback_management-compose.yaml
@@ -3,10 +3,10 @@
 # this file should be run in the root of the repo

 services:
-  feedbackmanagement-mongo-server:
+  feedbackmanagement:
     build:
       dockerfile: comps/feedback_management/mongo/Dockerfile
-    image: ${REGISTRY:-opea}/feedbackmanagement-mongo-server:${TAG:-latest}
+    image: ${REGISTRY:-opea}/feedbackmanagement:${TAG:-latest}
   feedbackmanagement-arango-server:
     build:
       dockerfile: comps/feedback_management/arango/Dockerfile

From f0659ade62921ad2a0e184fc65e43034e1346daa Mon Sep 17 00:00:00 2001
From: Anthony Mahanna
Date: Mon, 6 Jan 2025 17:28:45 -0500
Subject: [PATCH 21/22] add logs

---
 comps/dataprep/arango/langchain/prepare_doc_arango.py | 4 +++-
 comps/retrievers/arango/langchain/retriever_arango.py | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py
index a89984f08..babc29131 100644
--- a/comps/dataprep/arango/langchain/prepare_doc_arango.py
+++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py
@@ -302,7 +302,7 @@ async def ingest_documents(
     except (TypeError, ValueError) as e:
         if logflag:
             logger.error(f"Failed to initialize LLMGraphTransformer: {e}")
-        raise
+        raise e

     ########################################
     # Text Embeddings Inference (optional) #
@@ -340,5 +340,7 @@ async def ingest_documents(
         sys_db.create_database(ARANGO_DB_NAME)

     db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True)
+    if logflag:
+        logger.info(f"Connected to ArangoDB 
{db.version()}.") opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index 2a7b37412..fb9a0e095 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -245,5 +245,7 @@ async def retrieve( sys_db.create_database(ARANGO_DB_NAME) db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + if logflag: + logger.info(f"Connected to ArangoDB {db.version()}.") opea_microservices["opea_service@retriever_arango"].start() From aa2601322e2b16162bde961b66ac0ad36beccd39 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:06:24 -0500 Subject: [PATCH 22/22] dataprep improvements (#16) * dataprep improvements * fix: readme * new: make embedding generation mandatory * fix: exception handling * add logs * new: `ARANGO_USE_GRAPH_NAME` --- comps/dataprep/arango/langchain/README.md | 3 +- comps/dataprep/arango/langchain/config.py | 9 +- .../docker-compose-dataprep-arango.yaml | 17 +++ .../arango/langchain/prepare_doc_arango.py | 110 +++++++++++------- 4 files changed, 90 insertions(+), 49 deletions(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index fb383b42e..5c6985e67 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -116,9 +116,10 @@ ArangoDB Connection configuration - `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. ArangoDB Graph Insertion configuration -- `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection by node type. Defaults to `True`. - `INSERT_ASYNC`: If set to True, the microservice will insert the data into ArangoDB asynchronously. Defaults to `False`. - `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. Defaults to `500`. +- `ARANGO_GRAPH_NAME`: The name of the graph to use/create in ArangoDB Defaults to `GRAPH`. +- `ARANGO_USE_GRAPH_NAME`: If set to True, the microservice will use the graph name specified in the environment variable `ARANGO_GRAPH_NAME`. If set to False, the file name will be used as the graph name. Defaults to `True`. Text Generation Inference Configuration - `TGI_LLM_ENDPOINT`: The endpoint for the TGI service. 
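To make the graph-insertion options above concrete, here is a minimal sketch of calling the dataprep service once it is running. It assumes the service is published on port 6007 (as in the compose file changed below) and exposes a `/v1/dataprep` upload route; the endpoint path, graph name, file name, and chunking values are illustrative assumptions rather than part of this patch:

```bash
# Illustrative only: placeholder names and values, supplied to the dataprep
# service however your deployment passes environment (e.g. docker compose).
export ARANGO_URL="http://localhost:8529"
export ARANGO_GRAPH_NAME="KARATE"   # used because ARANGO_USE_GRAPH_NAME defaults to True
export ARANGO_BATCH_SIZE=500

# Assumed ingestion endpoint; the form fields mirror ingest_documents' parameters.
curl -X POST "http://localhost:6007/v1/dataprep" \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./karate_club.txt" \
  -F "chunk_size=1500" \
  -F "chunk_overlap=100"
```

With the response changes introduced in this patch, a successful call should report the created or updated graph name(s) under `graph_names`.
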
diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py
index c3caf2da2..98bfd498b 100644
--- a/comps/dataprep/arango/langchain/config.py
+++ b/comps/dataprep/arango/langchain/config.py
@@ -10,9 +10,10 @@
 ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system")

 # ArangoDB Graph Insertion configuration
-USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True)
 INSERT_ASYNC = os.getenv("INSERT_ASYNC", False)
 ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500)
+ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH")
+ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", True)

 # Text Generation Inference configuration
 TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
@@ -29,10 +30,12 @@

 # OpenAI configuration (alternative to TGI & TEI)
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
-OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512)
 OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o")
 OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0)
+OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
+OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512)
+OPENAI_CHAT_ENABLED = os.getenv("OPENAI_CHAT_ENABLED", True)
+OPENAI_EMBED_ENABLED = os.getenv("OPENAI_EMBED_ENABLED", True)

 # LLMGraphTransformer configuration
 SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH")
diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
index d3a9882c6..207e3d730 100644
--- a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
+++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
@@ -10,6 +10,22 @@ services:
       - "8529:8529"
     environment:
       ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD}
+  tei-embedding-service:
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+    container_name: tei-embedding-server
+    ports:
+      - "6006:80"
+    volumes:
+      - "./data:/data"
+    shm_size: 1g
+    environment:
+      no_proxy: ${no_proxy}
+      NO_PROXY: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+    ipc: host
+    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
   tgi_gaudi_service:
     image: ghcr.io/huggingface/tgi-gaudi:2.0.5
     container_name: tgi-service
@@ -30,6 +46,7 @@ services:
     depends_on:
       - arango-vector-db
       - tgi_gaudi_service
+      - tei-embedding-service
     ports:
       - "6007:6007"
     ipc: host
diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py
index babc29131..6a90c2d04 100644
--- a/comps/dataprep/arango/langchain/prepare_doc_arango.py
+++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py
@@ -12,6 +12,7 @@
     ALLOWED_RELATIONSHIPS,
     ARANGO_BATCH_SIZE,
     ARANGO_DB_NAME,
+    ARANGO_GRAPH_NAME,
     ARANGO_PASSWORD,
     ARANGO_URL,
     ARANGO_USERNAME,
@@ -33,7 +34,9 @@
     TGI_LLM_TIMEOUT,
     TGI_LLM_TOP_K,
     TGI_LLM_TOP_P,
-    USE_ONE_ENTITY_COLLECTION,
+    OPENAI_CHAT_ENABLED,
+    OPENAI_EMBED_ENABLED,
+    ARANGO_USE_GRAPH_NAME,
 )
 from fastapi import File, Form, HTTPException, UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -86,7 +89,7 @@
         logger.error(f"Could not set custom Prompt: {e}")


-def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_embeddings: bool) -> bool:
+def ingest_data_to_arango(doc_path: DocPath) -> 
str: """Ingest document to ArangoDB.""" path = doc_path.path @@ -128,7 +131,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_emb chunks = chunks + table_chunks if logflag: - logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.") + logger.info(f"Created {len(chunks)} chunks of the original file.") ################################ # Graph generation & insertion # @@ -140,29 +143,40 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_emb generate_schema_on_init=False, ) - for text in chunks: + if ARANGO_USE_GRAPH_NAME: + graph_name = ARANGO_GRAPH_NAME + else: + file_name = os.path.basename(path).split(".")[0] + graph_name = "".join(c for c in file_name if c.isalnum() or c in "_-:.@()+,=;$!*'%") + + if logflag: + logger.info(f"Creating graph {graph_name}.") + + for i, text in enumerate(chunks): document = Document(page_content=text) graph_doc = llm_transformer.process_response(document) - if generate_chunk_embeddings: - source = graph_doc.source - source.metadata["embedding"] = embeddings.embed_documents([source.page_content])[0] + source = graph_doc.source + source.metadata["embedding"] = embeddings.embed_documents([source.page_content])[0] graph.add_graph_documents( graph_documents=[graph_doc], include_source=True, graph_name=graph_name, - update_graph_definition_if_exists=not USE_ONE_ENTITY_COLLECTION, + update_graph_definition_if_exists=False, batch_size=ARANGO_BATCH_SIZE, - use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, + use_one_entity_collection=True, insert_async=INSERT_ASYNC, source_metadata_fields_to_extract_to_top_level={"embedding"}, ) + if logflag: + logger.info(f"Chunk {i} processed into graph.") + if logflag: logger.info("The graph is built.") - return True + return graph_name @register_microservice( @@ -180,13 +194,16 @@ async def ingest_documents( chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), - graph_name: str = Form("Graph"), - create_embeddings: bool = Form(True), ): if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") + if not files and not link_list: + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + graph_names_created = set() + if files: if not isinstance(files, list): files = [files] @@ -195,24 +212,24 @@ async def ingest_documents( encode_file = encode_filename(file.filename) save_path = upload_folder + encode_file await save_content_to_local_disk(save_path, file) - ingest_data_to_arango( - DocPath( - path=save_path, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - process_table=process_table, - table_strategy=table_strategy, - ), - graph_name=graph_name, - generate_chunk_embeddings=create_embeddings and embeddings is not None, - ) - uploaded_files.append(save_path) + try: + graph_name = ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + ) + + uploaded_files.append(save_path) + graph_names_created.add(graph_name) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to ingest {save_path} into ArangoDB: {e}") + if logflag: logger.info(f"Successfully saved file {save_path}") - result = {"status": 200, "message": "Data preparation succeeded"} - if logflag: - logger.info(result) - return result if link_list: link_list = json.loads(link_list) # Parse JSON string to list @@ -222,9 +239,9 
@@ async def ingest_documents( encoded_link = encode_filename(link) save_path = upload_folder + encoded_link + ".txt" content = parse_html([link])[0][0] + await save_content_to_local_disk(save_path, content) try: - await save_content_to_local_disk(save_path, content) - ingest_data_to_arango( + graph_name = ingest_data_to_arango( DocPath( path=save_path, chunk_size=chunk_size, @@ -232,21 +249,26 @@ async def ingest_documents( process_table=process_table, table_strategy=table_strategy, ), - graph_name=graph_name, - generate_chunk_embeddings=create_embeddings and embeddings is not None, ) - except json.JSONDecodeError: - raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + graph_names_created.add(graph_name) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to ingest {save_path} into ArangoDB: {e}") if logflag: logger.info(f"Successfully saved link {link}") - result = {"status": 200, "message": "Data preparation succeeded"} - if logflag: - logger.info(result) - return result + graph_names_created = list(graph_names_created) + + result = { + "status": 200, + "message": f"Data preparation succeeded: {graph_names_created}", + "graph_names": graph_names_created, + } - raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + if logflag: + logger.info(result) + + return result if __name__ == "__main__": @@ -255,7 +277,7 @@ async def ingest_documents( # Text Generation Inference # ############################# - if OPENAI_API_KEY: + if OPENAI_API_KEY and OPENAI_CHAT_ENABLED: if logflag: logger.info("OpenAI API Key is set. Verifying its validity...") openai.api_key = OPENAI_API_KEY @@ -282,7 +304,7 @@ async def ingest_documents( timeout=TGI_LLM_TIMEOUT, ) else: - raise ValueError("No text generation inference endpoint is set.") + raise ValueError("No text generation environment variables are set, cannot generate graphs.") try: llm_transformer = LLMGraphTransformer( @@ -308,7 +330,7 @@ async def ingest_documents( # Text Embeddings Inference (optional) # ######################################## - if OPENAI_API_KEY: + if OPENAI_API_KEY and OPENAI_EMBED_ENABLED: # Use OpenAI embeddings embeddings = OpenAIEmbeddings( model=OPENAI_EMBED_MODEL, @@ -325,9 +347,7 @@ async def ingest_documents( # Use local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) else: - if logflag: - logger.warning("No embeddings environment variables are set, cannot generate embeddings.") - embeddings = None + raise ValueError("No embeddings environment variables are set, cannot generate embeddings.") ############ # ArangoDB #