diff --git a/EdgeCraftRAG/Dockerfile b/EdgeCraftRAG/Dockerfile
index 3c9711dea..2e6191a01 100644
--- a/EdgeCraftRAG/Dockerfile
+++ b/EdgeCraftRAG/Dockerfile
@@ -13,13 +13,11 @@ RUN useradd -m -s /bin/bash user && \
     mkdir -p /home/user && \
     chown -R user /home/user/
 
-COPY ./edgecraftrag /home/user/edgecraftrag
+COPY ./requirements.txt /home/user/requirements.txt
 COPY ./chatqna.py /home/user/chatqna.py
 
-WORKDIR /home/user/edgecraftrag
-RUN pip install --no-cache-dir -r requirements.txt
-
 WORKDIR /home/user
+RUN pip install --no-cache-dir -r requirements.txt
 
 USER user
 
diff --git a/EdgeCraftRAG/Dockerfile.server b/EdgeCraftRAG/Dockerfile.server
index c04dc0a54..f076dcd16 100644
--- a/EdgeCraftRAG/Dockerfile.server
+++ b/EdgeCraftRAG/Dockerfile.server
@@ -25,6 +25,9 @@ RUN useradd -m -s /bin/bash user && \
 
 COPY ./edgecraftrag /home/user/edgecraftrag
 
+RUN mkdir -p /home/user/gradio_cache
+ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
+
 WORKDIR /home/user/edgecraftrag
 RUN pip install --no-cache-dir -r requirements.txt
 
diff --git a/EdgeCraftRAG/README.md b/EdgeCraftRAG/README.md
index da8d2efb0..a24822532 100644
--- a/EdgeCraftRAG/README.md
+++ b/EdgeCraftRAG/README.md
@@ -7,39 +7,112 @@ quality and performance.
 
 ## Quick Start Guide
 
-### Run Containers with Docker Compose
+### (Optional) Build Docker Images for Mega Service, Server and UI on your own
+
+If you want to build the images on your own, please follow the steps below:
+
+```bash
+cd GenAIExamples/EdgeCraftRAG
+
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag:latest -f Dockerfile .
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag-server:latest -f Dockerfile.server .
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui .
+```
+
+### Using Intel Arc GPU
+
+#### Local inference with OpenVINO for Intel Arc GPU
+
+You can select the "local" type in the generation field, which is the default approach to enable the Intel Arc GPU for the LLM. You don't need to build any extra images for the "local" type.
+
+#### vLLM with OpenVINO for Intel Arc GPU
+
+You can also select "vLLM" as the generation type. To enable it, you'll need to build the vLLM image for the Intel Arc GPU before bootstrapping the services.
+Please follow [vLLM with OpenVINO](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#build-docker-image) to build the vLLM image.
+
+### Start Edge Craft RAG Services with Docker Compose
+
+If you want to enable the vLLM with OpenVINO service, please finish the steps in [Launch vLLM with OpenVINO service](#optional-launch-vllm-with-openvino-service) first.
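Before bringing the stack up, you can quickly confirm that the Arc GPU device nodes are visible and, if you take the vLLM route, that the prerequisite image has been built. A minimal pre-flight sketch (the `opea/vllm-arc:latest` tag matches the commented-out `vllm-openvino-server` service in `compose.yaml`):

```bash
# The services access the Intel Arc GPU through /dev/dri and the video/render groups.
ls -l /dev/dri
getent group video render

# Only needed for the vLLM path: the commented-out vllm-openvino-server service
# in compose.yaml expects this image tag to exist locally.
docker images opea/vllm-arc:latest
```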
```bash cd GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc export MODEL_PATH="your model path for all your models" export DOC_PATH="your doc path for uploading a dir of files" +export GRADIO_PATH="your gradio cache path for transferring files" + +# Make sure all 3 folders have 1000:1000 permission, otherwise +# chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${GRADIO_PATH} + +# Use `ip a` to check your active ip export HOST_IP="your host ip" -export UI_SERVICE_PORT="port for UI service" -# Optional for vllm endpoint -export vLLM_ENDPOINT="http://${HOST_IP}:8008" +# Check group id of video and render +export VIDEOGROUPID=$(getent group video | cut -d: -f3) +export RENDERGROUPID=$(getent group render | cut -d: -f3) # If you have a proxy configured, uncomment below line -# export no_proxy=$no_proxy,${HOST_IP},edgecraftrag,edgecraftrag-server +# export no_proxy=${no_proxy},${HOST_IP},edgecraftrag,edgecraftrag-server +# export NO_PROXY=${NO_PROXY},${HOST_IP},edgecraftrag,edgecraftrag-server # If you have a HF mirror configured, it will be imported to the container # export HF_ENDPOINT="your HF mirror endpoint" # By default, the ports of the containers are set, uncomment if you want to change # export MEGA_SERVICE_PORT=16011 # export PIPELINE_SERVICE_PORT=16011 +# export UI_SERVICE_PORT="8082" + +# Prepare models for embedding, reranking and generation, you can also choose other OpenVINO optimized models +# Here is the example: +pip install --upgrade --upgrade-strategy eager "optimum[openvino]" + +optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity +optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-reranker-large --task sentence-similarity +optimum-cli export openvino -m Qwen/Qwen2-7B-Instruct ${MODEL_PATH}/Qwen/Qwen2-7B-Instruct/INT4_compressed_weights --weight-format int4 docker compose up -d + ``` -### (Optional) Build Docker Images for Mega Service, Server and UI by your own +#### (Optional) Launch vLLM with OpenVINO service + +1. Set up Environment Variables ```bash -cd GenAIExamples/EdgeCraftRAG +export LLM_MODEL=#your model id +export VLLM_SERVICE_PORT=8008 +export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}" +export HUGGINGFACEHUB_API_TOKEN=#your HF token +``` -docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag:latest -f Dockerfile . -docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-server:latest -f Dockerfile.server . -docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui . +2. 
Uncomment below code in 'GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml' + +```bash + # vllm-openvino-server: + # container_name: vllm-openvino-server + # image: opea/vllm-arc:latest + # ports: + # - ${VLLM_SERVICE_PORT:-8008}:80 + # environment: + # HTTPS_PROXY: ${https_proxy} + # HTTP_PROXY: ${https_proxy} + # VLLM_OPENVINO_DEVICE: GPU + # HF_ENDPOINT: ${HF_ENDPOINT} + # HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + # volumes: + # - /dev/dri/by-path:/dev/dri/by-path + # - $HOME/.cache/huggingface:/root/.cache/huggingface + # devices: + # - /dev/dri + # entrypoint: /bin/bash -c "\ + # cd / && \ + # export VLLM_CPU_KVCACHE_SPACE=50 && \ + # export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON && \ + # python3 -m vllm.entrypoints.openai.api_server \ + # --model '${LLM_MODEL}' \ + # --max_model_len=1024 \ + # --host 0.0.0.0 \ + # --port 80" ``` ### ChatQnA with LLM Example (Command Line) @@ -109,7 +182,7 @@ curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: app # } # Prepare data from local directory -curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"#REPLACE WITH YOUR LOCAL DOC DIR#"}' | jq '.' +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR DIR WITHIN MOUNTED DOC PATH#"}' | jq '.' # Validate Mega Service curl -X POST http://${HOST_IP}:16011/v1/chatqna -H "Content-Type: application/json" -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "top_n":5, "max_tokens":512}' | jq '.' @@ -121,33 +194,14 @@ Open your browser, access http://${HOST_IP}:8082 > Your browser should be running on the same host of your console, otherwise you will need to access UI with your host domain name instead of ${HOST_IP}. -### (Optional) Launch vLLM with OpenVINO service +To create a default pipeline, you need to click the `Create Pipeline` button on the `RAG Settings` page. You can also create multiple pipelines or update existing pipelines through the `Pipeline Configuration`, but please note that active pipelines cannot be updated. +![create_pipeline](assets/img/create_pipeline.png) -```bash -# 1. export LLM_MODEL -export LLM_MODEL="your model id" -# 2. Uncomment below code in 'GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml' - # vllm-service: - # image: vllm:openvino - # container_name: vllm-openvino-server - # depends_on: - # - vllm-service - # ports: - # - "8008:80" - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # vLLM_ENDPOINT: ${vLLM_ENDPOINT} - # LLM_MODEL: ${LLM_MODEL} - # entrypoint: /bin/bash -c "\ - # cd / && \ - # export VLLM_CPU_KVCACHE_SPACE=50 && \ - # python3 -m vllm.entrypoints.openai.api_server \ - # --model '${LLM_MODEL}' \ - # --host 0.0.0.0 \ - # --port 80" -``` +After the pipeline creation, you can upload your data in the `Chatbot` page. +![upload_data](assets/img/upload_data.png) + +Then, you can submit messages in the chat box. +![chat_with_rag](assets/img/chat_with_rag.png) ## Advanced User Guide @@ -156,27 +210,13 @@ export LLM_MODEL="your model id" #### Create a pipeline ```bash -curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline.json | jq '.' -``` - -It will take some time to prepare the embedding model. - -#### Upload a text - -```bash -curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.' 
-``` - -#### Provide a query to retrieve context with similarity search. - -```bash -curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d @examples/test_query.json | jq '.' +curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.' ``` -#### Create the second pipeline test2 +#### Update a pipeline ```bash -curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline2.json | jq '.' +curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.' ``` #### Check all pipelines @@ -185,19 +225,10 @@ curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: app curl -X GET http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" | jq '.' ``` -#### Compare similarity retrieval (test1) and keyword retrieval (test2) +#### Activate a pipeline ```bash -# Activate pipeline test1 curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test1 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.' -# Similarity retrieval -curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.' - -# Activate pipeline test2 -curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test2 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.' -# Keyword retrieval -curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.' - ``` ### Model Management @@ -205,7 +236,7 @@ curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/ #### Load a model ```bash -curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d @examples/test_model_load.json | jq '.' +curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d '{"model_type": "reranker", "model_id": "BAAI/bge-reranker-large", "model_path": "./models/bge_ov_reranker", "device": "cpu"}' | jq '.' ``` It will take some time to load the model. @@ -219,7 +250,7 @@ curl -X GET http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: applica #### Update a model ```bash -curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d @examples/test_model_update.json | jq '.' +curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d '{"model_type": "reranker", "model_id": "BAAI/bge-reranker-large", "model_path": "./models/bge_ov_reranker", "device": "gpu"}' | jq '.' ``` #### Check a certain model @@ -239,14 +270,14 @@ curl -X DELETE http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-larg #### Add a text ```bash -curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.' +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"text":"#REPLACE WITH YOUR TEXT"}' | jq '.' ``` #### Add files from existed file path ```bash -curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_dir.json | jq '.' -curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.' 
+curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR DIR WITHIN MOUNTED DOC PATH#"}' | jq '.' +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR FILE WITHIN MOUNTED DOC PATH#"}' | jq '.' ``` #### Check all files @@ -270,5 +301,5 @@ curl -X DELETE http://${HOST_IP}:16010/v1/data/files/test2.docx -H "Content-Type #### Update a file ```bash -curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.' +curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR FILE WITHIN MOUNTED DOC PATH#"}' | jq '.' ``` diff --git a/EdgeCraftRAG/assets/img/chat_with_rag.png b/EdgeCraftRAG/assets/img/chat_with_rag.png new file mode 100644 index 000000000..04000ef37 Binary files /dev/null and b/EdgeCraftRAG/assets/img/chat_with_rag.png differ diff --git a/EdgeCraftRAG/assets/img/create_pipeline.png b/EdgeCraftRAG/assets/img/create_pipeline.png new file mode 100644 index 000000000..53331b2b7 Binary files /dev/null and b/EdgeCraftRAG/assets/img/create_pipeline.png differ diff --git a/EdgeCraftRAG/assets/img/upload_data.png b/EdgeCraftRAG/assets/img/upload_data.png new file mode 100644 index 000000000..8fff43e68 Binary files /dev/null and b/EdgeCraftRAG/assets/img/upload_data.png differ diff --git a/EdgeCraftRAG/chatqna.py b/EdgeCraftRAG/chatqna.py index 1afa9621c..02f0a84dd 100644 --- a/EdgeCraftRAG/chatqna.py +++ b/EdgeCraftRAG/chatqna.py @@ -18,6 +18,7 @@ ChatMessage, UsageInfo, ) +from comps.cores.proto.docarray import LLMParams from fastapi import Request from fastapi.responses import StreamingResponse @@ -30,7 +31,20 @@ def __init__(self, megaservice, host="0.0.0.0", port=16011): async def handle_request(self, request: Request): input = await request.json() - result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input) + stream_opt = input.get("stream", False) + chat_request = ChatCompletionRequest.parse_obj(input) + parameters = LLMParams( + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, + top_k=chat_request.top_k if chat_request.top_k else 10, + top_p=chat_request.top_p if chat_request.top_p else 0.95, + temperature=chat_request.temperature if chat_request.temperature else 0.01, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, + streaming=stream_opt, + chat_template=chat_request.chat_template if chat_request.chat_template else None, + ) + result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input, llm_parameters=parameters) for node, response in result_dict.items(): if isinstance(response, StreamingResponse): return response @@ -61,7 +75,7 @@ def add_remote_service(self): port=PIPELINE_SERVICE_PORT, endpoint="/v1/chatqna", use_remote_service=True, - service_type=ServiceType.UNDEFINED, + service_type=ServiceType.LLM, ) self.megaservice.add(edgecraftrag) self.gateway = EdgeCraftRagGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) diff --git a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml index f877b7c58..a695fbc02 100644 
--- a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml +++ b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml @@ -14,12 +14,15 @@ services: volumes: - ${MODEL_PATH:-${PWD}}:/home/user/models - ${DOC_PATH:-${PWD}}:/home/user/docs + - ${GRADIO_PATH:-${PWD}}:/home/user/gradio_cache + - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache ports: - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010} devices: - /dev/dri:/dev/dri group_add: - - video + - ${VIDEOGROUPID:-44} + - ${RENDERGROUPID:-109} ecrag: image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest} container_name: edgecraftrag @@ -48,31 +51,42 @@ services: PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}} UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082} UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0} + volumes: + - ${GRADIO_PATH:-${PWD}}:/home/user/gradio_cache ports: - - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082} + - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082} restart: always depends_on: - server - ecrag - # vllm-service: - # image: vllm:openvino + # vllm-openvino-server: # container_name: vllm-openvino-server + # image: opea/vllm-arc:latest # ports: - # - "8008:80" + # - ${VLLM_SERVICE_PORT:-8008}:80 # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # vLLM_ENDPOINT: ${vLLM_ENDPOINT} - # LLM_MODEL: ${LLM_MODEL} + # HTTPS_PROXY: ${https_proxy} + # HTTP_PROXY: ${https_proxy} + # VLLM_OPENVINO_DEVICE: GPU + # HF_ENDPOINT: ${HF_ENDPOINT} + # HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + # volumes: + # - /dev/dri/by-path:/dev/dri/by-path + # - $HOME/.cache/huggingface:/root/.cache/huggingface + # devices: + # - /dev/dri + # group_add: + # - ${VIDEOGROUPID:-44} + # - ${RENDERGROUPID:-109} # entrypoint: /bin/bash -c "\ # cd / && \ # export VLLM_CPU_KVCACHE_SPACE=50 && \ + # export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON && \ # python3 -m vllm.entrypoints.openai.api_server \ # --model '${LLM_MODEL}' \ + # --max_model_len=1024 \ # --host 0.0.0.0 \ # --port 80" - networks: default: driver: bridge diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py index dfd32c29e..8249950d0 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py @@ -25,5 +25,8 @@ async def retrieval(request: ChatCompletionRequest): # ChatQnA @chatqna_app.post(path="/v1/chatqna") async def chatqna(request: ChatCompletionRequest): - ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request) - return str(ret) + if request.stream: + return ctx.get_pipeline_mgr().run_pipeline(chat_request=request) + else: + ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request) + return str(ret) diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py index 9d008e82f..f58390cfd 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py @@ -157,16 +157,13 @@ def update_pipeline_handler(pl, req): gen = req.generator if gen.model is None: return "No ChatQnA Model" - if gen.inference_type == InferenceType.VLLM: - if gen.model.model_id: - model_ref = gen.model.model_id - else: - model_ref = gen.model.model_path - pl.generator = QnAGenerator(model_ref, gen.prompt_path, gen.inference_type) - elif gen.inference_type == InferenceType.LOCAL: + if gen.inference_type: model = ctx.get_model_mgr().search_model(gen.model) if model is None: - gen.model.model_type = ModelType.LLM + if gen.inference_type == 
InferenceType.VLLM: + gen.model.model_type = ModelType.VLLM + else: + gen.model.model_type = ModelType.LLM model = ctx.get_model_mgr().load_model(gen.model) ctx.get_model_mgr().add(model) # Use weakref to achieve model deletion and memory release diff --git a/EdgeCraftRAG/edgecraftrag/api_schema.py b/EdgeCraftRAG/edgecraftrag/api_schema.py index 1f124a7f9..5927e0304 100644 --- a/EdgeCraftRAG/edgecraftrag/api_schema.py +++ b/EdgeCraftRAG/edgecraftrag/api_schema.py @@ -10,6 +10,7 @@ class ModelIn(BaseModel): model_type: Optional[str] = "LLM" model_id: Optional[str] model_path: Optional[str] = "./" + weight: Optional[str] device: Optional[str] = "cpu" diff --git a/EdgeCraftRAG/edgecraftrag/base.py b/EdgeCraftRAG/edgecraftrag/base.py index d8c7aaef8..a163c486f 100644 --- a/EdgeCraftRAG/edgecraftrag/base.py +++ b/EdgeCraftRAG/edgecraftrag/base.py @@ -27,6 +27,7 @@ class ModelType(str, Enum): EMBEDDING = "embedding" RERANKER = "reranker" LLM = "llm" + VLLM = "vllm" class FileType(str, Enum): diff --git a/EdgeCraftRAG/edgecraftrag/components/generator.py b/EdgeCraftRAG/edgecraftrag/components/generator.py index cbfd6686d..a888bf18f 100644 --- a/EdgeCraftRAG/edgecraftrag/components/generator.py +++ b/EdgeCraftRAG/edgecraftrag/components/generator.py @@ -1,10 +1,11 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import asyncio import dataclasses import os -from comps import GeneratedDoc, opea_telemetry +from comps import GeneratedDoc from edgecraftrag.base import BaseComponent, CompType, GeneratorType from fastapi.responses import StreamingResponse from langchain_core.prompts import PromptTemplate @@ -12,18 +13,6 @@ from pydantic import model_serializer -@opea_telemetry -def post_process_text(text: str): - if text == " ": - return "data: @#$\n\n" - if text == "\n": - return "data:
\n\n" - if text.isspace(): - return None - new_text = text.replace(" ", "@#$") - return f"data: {new_text}\n\n" - - class QnAGenerator(BaseComponent): def __init__(self, llm_model, prompt_template, inference_type, **kwargs): @@ -76,8 +65,18 @@ def run(self, chat_request, retrieved_nodes, **kwargs): repetition_penalty=chat_request.repetition_penalty, ) self.llm().generate_kwargs = generate_kwargs + if chat_request.stream: + + async def stream_generator(): + response = self.llm().stream_complete(prompt_str) + for r in response: + yield r.delta + # Simulate asynchronous operation + await asyncio.sleep(0.01) - return self.llm().complete(prompt_str) + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + return self.llm().complete(prompt_str) def run_vllm(self, chat_request, retrieved_nodes, **kwargs): if self.llm is None: @@ -92,7 +91,7 @@ def run_vllm(self, chat_request, retrieved_nodes, **kwargs): prompt_str = self.prompt.format(input=query, context=text_gen_context) llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8008") - model_name = self.llm + model_name = self.llm().model_id llm = OpenAILike( api_key="fake", api_base=llm_endpoint + "/v1", @@ -106,12 +105,10 @@ def run_vllm(self, chat_request, retrieved_nodes, **kwargs): if chat_request.stream: async def stream_generator(): - response = await llm.astream_complete(prompt_str) - async for text in response: - output = text.text - yield f"data: {output}\n\n" - - yield "data: [DONE]\n\n" + response = llm.stream_complete(prompt_str) + for text in response: + yield text.delta + await asyncio.sleep(0.01) return StreamingResponse(stream_generator(), media_type="text/event-stream") else: @@ -122,7 +119,12 @@ async def stream_generator(): @model_serializer def ser_model(self): - set = {"idx": self.idx, "generator_type": self.comp_subtype, "model": self.model_id} + set = { + "idx": self.idx, + "generator_type": self.comp_subtype, + "inference_type": self.inference_type, + "model": self.llm(), + } return set diff --git a/EdgeCraftRAG/edgecraftrag/components/model.py b/EdgeCraftRAG/edgecraftrag/components/model.py index 72ee7f16e..75fa69c41 100644 --- a/EdgeCraftRAG/edgecraftrag/components/model.py +++ b/EdgeCraftRAG/edgecraftrag/components/model.py @@ -14,6 +14,7 @@ class BaseModelComponent(BaseComponent): model_id: Optional[str] = Field(default="") model_path: Optional[str] = Field(default="") + weight: Optional[str] = Field(default="") device: Optional[str] = Field(default="cpu") def run(self, **kwargs) -> Any: @@ -26,6 +27,7 @@ def ser_model(self): "type": self.comp_subtype, "model_id": self.model_id, "model_path": self.model_path, + "weight": self.weight, "device": self.device, } return set @@ -33,7 +35,7 @@ def ser_model(self): class OpenVINOEmbeddingModel(BaseModelComponent, OpenVINOEmbedding): - def __init__(self, model_id, model_path, device): + def __init__(self, model_id, model_path, device, weight): OpenVINOEmbedding.create_and_save_openvino_model(model_id, model_path) OpenVINOEmbedding.__init__(self, model_id_or_path=model_path, device=device) self.comp_type = CompType.MODEL @@ -41,11 +43,12 @@ def __init__(self, model_id, model_path, device): self.model_id = model_id self.model_path = model_path self.device = device + self.weight = "" class OpenVINORerankModel(BaseModelComponent, OpenVINORerank): - def __init__(self, model_id, model_path, device): + def __init__(self, model_id, model_path, device, weight): OpenVINORerank.create_and_save_openvino_model(model_id, model_path) 
OpenVINORerank.__init__( self, @@ -57,11 +60,12 @@ def __init__(self, model_id, model_path, device): self.model_id = model_id self.model_path = model_path self.device = device + self.weight = "" class OpenVINOLLMModel(BaseModelComponent, OpenVINOLLM): - def __init__(self, model_id, model_path, device): + def __init__(self, model_id, model_path, device, weight): OpenVINOLLM.__init__( self, model_id_or_path=model_path, @@ -72,3 +76,4 @@ def __init__(self, model_id, model_path, device): self.model_id = model_id self.model_path = model_path self.device = device + self.weight = weight diff --git a/EdgeCraftRAG/edgecraftrag/components/pipeline.py b/EdgeCraftRAG/edgecraftrag/components/pipeline.py index 4a2932e00..5af8b5cbe 100644 --- a/EdgeCraftRAG/edgecraftrag/components/pipeline.py +++ b/EdgeCraftRAG/edgecraftrag/components/pipeline.py @@ -110,8 +110,10 @@ def model_existed(self, model_id: str) -> bool: return True if self.generator: llm = self.generator.llm - if llm() and llm().model_id == model_id: - return True + if isinstance(llm, str): + return llm == model_id + else: + return llm().model_id == model_id return False @@ -154,7 +156,8 @@ def run_test_generator(pl: Pipeline, chat_request: ChatCompletionRequest) -> Any if pl.generator is None: return "No Generator Specified" if pl.generator.inference_type == InferenceType.LOCAL: - answer = pl.generator.run(chat_request, retri_res) + return pl.generator.run(chat_request, retri_res) elif pl.generator.inference_type == InferenceType.VLLM: - answer = pl.generator.run_vllm(chat_request, retri_res) - return answer + return pl.generator.run_vllm(chat_request, retri_res) + else: + return "LLM inference_type not supported" diff --git a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py index 73a77e48a..6d0166bc5 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py @@ -3,9 +3,14 @@ import asyncio -from edgecraftrag.api_schema import IndexerIn, ModelIn, NodeParserIn -from edgecraftrag.base import BaseComponent, BaseMgr, CallbackType, ModelType -from edgecraftrag.components.model import OpenVINOEmbeddingModel, OpenVINOLLMModel, OpenVINORerankModel +from edgecraftrag.api_schema import ModelIn +from edgecraftrag.base import BaseComponent, BaseMgr, CompType, ModelType +from edgecraftrag.components.model import ( + BaseModelComponent, + OpenVINOEmbeddingModel, + OpenVINOLLMModel, + OpenVINORerankModel, +) class ModelMgr(BaseMgr): @@ -78,17 +83,25 @@ def load_model(model_para: ModelIn): model_id=model_para.model_id, model_path=model_para.model_path, device=model_para.device, + weight=model_para.weight, ) case ModelType.RERANKER: model = OpenVINORerankModel( model_id=model_para.model_id, model_path=model_para.model_path, device=model_para.device, + weight=model_para.weight, ) case ModelType.LLM: model = OpenVINOLLMModel( model_id=model_para.model_id, model_path=model_para.model_path, device=model_para.device, + weight=model_para.weight, ) + case ModelType.VLLM: + model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="") + model.comp_type = CompType.MODEL + model.comp_subtype = ModelType.VLLM + model.model_id_or_path = model_para.model_id return model diff --git a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt index 800d1fa2f..aa57e6059 100644 --- a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt +++ 
b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt @@ -5,4 +5,4 @@ <|im_start|>System: Pay attention to your formatting of response. If you need to reference content from context, try to keep the formatting.<|im_end|> <|im_start|>System: Try to summarize from the context, do some reasoning before response, then response. Make sure your response is logically sound and self-consistent.<|im_end|> -<|im_start|>{input} +<|im_start|>{input} \ No newline at end of file diff --git a/EdgeCraftRAG/edgecraftrag/requirements.txt b/EdgeCraftRAG/edgecraftrag/requirements.txt index 3756c732a..6757aa752 100644 --- a/EdgeCraftRAG/edgecraftrag/requirements.txt +++ b/EdgeCraftRAG/edgecraftrag/requirements.txt @@ -1,6 +1,5 @@ docx2txt faiss-cpu>=1.8.0.post1 -gradio>=4.44.1 langchain-core==0.2.29 llama-index>=0.11.0 llama-index-embeddings-openvino>=0.4.0 @@ -9,8 +8,4 @@ llama-index-llms-openvino>=0.3.1 llama-index-postprocessor-openvino-rerank>=0.3.0 llama-index-retrievers-bm25>=0.3.0 llama-index-vector-stores-faiss>=0.2.1 -loguru>=0.7.2 -omegaconf>=2.3.0 opea-comps>=0.9 -py-cpuinfo>=9.0.0 -uvicorn>=0.30.6 diff --git a/EdgeCraftRAG/requirements.txt b/EdgeCraftRAG/requirements.txt new file mode 100644 index 000000000..5b27f1434 --- /dev/null +++ b/EdgeCraftRAG/requirements.txt @@ -0,0 +1,2 @@ +fastapi>=0.115.0 +opea-comps>=0.9 diff --git a/EdgeCraftRAG/tests/test_pipeline_local_llm.json b/EdgeCraftRAG/tests/test_pipeline_local_llm.json index 18895d6e5..13485cebc 100644 --- a/EdgeCraftRAG/tests/test_pipeline_local_llm.json +++ b/EdgeCraftRAG/tests/test_pipeline_local_llm.json @@ -9,7 +9,6 @@ "indexer_type": "faiss_vector", "embedding_model": { "model_id": "BAAI/bge-small-en-v1.5", - "model_path": "./models/bge_ov_embedding", "device": "auto" } }, @@ -23,7 +22,6 @@ "top_n": 2, "reranker_model": { "model_id": "BAAI/bge-reranker-large", - "model_path": "./models/bge_ov_reranker", "device": "auto" } } @@ -31,7 +29,6 @@ "generator": { "model": { "model_id": "Qwen/Qwen2-7B-Instruct", - "model_path": "./models/qwen2-7b-instruct/INT4_compressed_weights", "device": "cpu" }, "prompt_path": "./edgecraftrag/prompt_template/default_prompt.txt", diff --git a/EdgeCraftRAG/ui/docker/Dockerfile.ui b/EdgeCraftRAG/ui/docker/Dockerfile.ui index 46a14a6e9..3dacb35d8 100644 --- a/EdgeCraftRAG/ui/docker/Dockerfile.ui +++ b/EdgeCraftRAG/ui/docker/Dockerfile.ui @@ -11,10 +11,11 @@ RUN useradd -m -s /bin/bash user && \ COPY ./ui/gradio /home/user/ui COPY ./edgecraftrag /home/user/edgecraftrag -WORKDIR /home/user/edgecraftrag -RUN pip install --no-cache-dir -r requirements.txt +RUN mkdir -p /home/user/gradio_cache +ENV GRADIO_TEMP_DIR=/home/user/gradio_cache WORKDIR /home/user/ui +RUN pip install --no-cache-dir -r requirements.txt USER user diff --git a/EdgeCraftRAG/ui/gradio/default.yaml b/EdgeCraftRAG/ui/gradio/default.yaml index 1421da8f4..39c3ee92e 100644 --- a/EdgeCraftRAG/ui/gradio/default.yaml +++ b/EdgeCraftRAG/ui/gradio/default.yaml @@ -3,7 +3,6 @@ # Model language for LLM model_language: "Chinese" -vector_db: "FAISS" splitter_name: "RecursiveCharacter" k_rerank: 5 search_method: "similarity" @@ -29,21 +28,19 @@ k_retrieval: 30 postprocessor: "reranker" # Generator -generator: "local" -prompt_path: "./data/default_prompt.txt" +generator: "chatqna" +prompt_path: "./edgecraftrag/prompt_template/default_prompt.txt" # Models embedding_model_id: "BAAI/bge-small-en-v1.5" -embedding_model_path: "./bge_ov_embedding" # Device for embedding model inference embedding_device: "AUTO" rerank_model_id: "BAAI/bge-reranker-large" 
-rerank_model_path: "./bge_ov_reranker" # Device for reranking model inference rerank_device: "AUTO" -llm_model_id: "qwen2-7b-instruct" -llm_model_path: "./qwen2-7b-instruct/INT4_compressed_weights" +llm_model_id: "Qwen/Qwen2-7B-Instruct" +llm_weights: "INT4" # Device for LLM model inference llm_device: "AUTO" diff --git a/EdgeCraftRAG/ui/gradio/ecrag_client.py b/EdgeCraftRAG/ui/gradio/ecrag_client.py index 47b5f776d..6593cbd94 100644 --- a/EdgeCraftRAG/ui/gradio/ecrag_client.py +++ b/EdgeCraftRAG/ui/gradio/ecrag_client.py @@ -1,13 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import os import sys +import platform_config as pconf import requests sys.path.append("..") -import os - from edgecraftrag import api_schema PIPELINE_SERVICE_HOST_IP = os.getenv("PIPELINE_SERVICE_HOST_IP", "127.0.0.1") @@ -42,6 +42,7 @@ def create_update_pipeline( vector_search_top_k, postprocessor, generator, + llm_infertype, llm_id, llm_device, llm_weights, @@ -50,6 +51,7 @@ def create_update_pipeline( rerank_id, rerank_device, ): + llm_path = pconf.get_llm_model_dir("./models/", llm_id, llm_weights) req_dict = api_schema.PipelineCreateIn( name=name, active=active, @@ -60,9 +62,9 @@ def create_update_pipeline( indexer_type=indexer, embedding_model=api_schema.ModelIn( model_id=embedding_id, - # TODO: remove hardcoding - model_path="./bge_ov_embedding", + model_path="./models/" + embedding_id, device=embedding_device, + weight=llm_weights, ), ), retriever=api_schema.RetrieverIn(retriever_type=retriever, retriever_topk=vector_search_top_k), @@ -70,22 +72,15 @@ def create_update_pipeline( api_schema.PostProcessorIn( processor_type=postprocessor[0], reranker_model=api_schema.ModelIn( - model_id=rerank_id, - # TODO: remove hardcoding - model_path="./bge_ov_reranker", - device=rerank_device, + model_id=rerank_id, model_path="./models/" + rerank_id, device=rerank_device, weight=llm_weights ), ) ], generator=api_schema.GeneratorIn( # TODO: remove hardcoding prompt_path="./edgecraftrag/prompt_template/default_prompt.txt", - model=api_schema.ModelIn( - model_id=llm_id, - # TODO: remove hardcoding - model_path="./models/qwen2-7b-instruct/INT4_compressed_weights", - device=llm_device, - ), + model=api_schema.ModelIn(model_id=llm_id, model_path=llm_path, device=llm_device, weight=llm_weights), + inference_type=llm_infertype, ), ) # hard code only for test @@ -105,7 +100,7 @@ def activate_pipeline(name): return restext, status -def create_vectordb(docs, spliter, vector_db): +def create_vectordb(docs, spliter): req_dict = api_schema.FilesIn(local_paths=docs) res = requests.post(f"{server_addr}/v1/data/files", json=req_dict.dict(), proxies={"http": None}) return res.text @@ -116,6 +111,8 @@ def get_files(): files = [] for file in res.json(): files.append((file["file_name"], file["file_id"])) + if not files: + files.append((None, None)) return files diff --git a/EdgeCraftRAG/ui/gradio/ecragui.py b/EdgeCraftRAG/ui/gradio/ecragui.py index 3c198bf2a..23a5286de 100644 --- a/EdgeCraftRAG/ui/gradio/ecragui.py +++ b/EdgeCraftRAG/ui/gradio/ecragui.py @@ -2,11 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import argparse -import json +import os import platform -import re from datetime import datetime -from pathlib import Path import cpuinfo import distro # if running Python 3.8 or above @@ -17,41 +15,22 @@ # Creation of the ModelLoader instance and loading models remain the same import platform_config as pconf import psutil -import requests from loguru import logger from omegaconf import OmegaConf 
-from platform_config import get_available_devices, get_available_weights, get_local_available_models +from platform_config import ( + get_avail_llm_inference_type, + get_available_devices, + get_available_weights, + get_local_available_models, +) pipeline_df = [] -import os MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "127.0.0.1") MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 16011)) UI_SERVICE_HOST_IP = os.getenv("UI_SERVICE_HOST_IP", "0.0.0.0") -UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8084)) - - -def get_llm_model_dir(llm_model_id, weights_compression): - model_dirs = { - "fp16_model_dir": Path(llm_model_id) / "FP16", - "int8_model_dir": Path(llm_model_id) / "INT8_compressed_weights", - "int4_model_dir": Path(llm_model_id) / "INT4_compressed_weights", - } - - if weights_compression == "INT4": - model_dir = model_dirs["int4_model_dir"] - elif weights_compression == "INT8": - model_dir = model_dirs["int8_model_dir"] - else: - model_dir = model_dirs["fp16_model_dir"] - - if not model_dir.exists(): - raise FileNotFoundError(f"The model directory {model_dir} does not exist.") - elif not model_dir.is_dir(): - raise NotADirectoryError(f"The path {model_dir} is not a directory.") - - return model_dir +UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8082)) def get_system_status(): @@ -87,31 +66,7 @@ def get_system_status(): return status -def build_demo(cfg, args): - - def load_chatbot_models( - llm_id, - llm_device, - llm_weights, - embedding_id, - embedding_device, - rerank_id, - rerank_device, - ): - req_dict = { - "llm_id": llm_id, - "llm_device": llm_device, - "llm_weights": llm_weights, - "embedding_id": embedding_id, - "embedding_device": embedding_device, - "rerank_id": rerank_id, - "rerank_device": rerank_device, - } - # hard code only for test - worker_addr = "http://127.0.0.1:8084" - print(req_dict) - result = requests.post(f"{worker_addr}/load", json=req_dict, proxies={"http": None}) - return result.text +def build_app(cfg, args): def user(message, history): """Callback function for updating user messages in interface on submit button click. @@ -131,11 +86,9 @@ async def bot( top_p, top_k, repetition_penalty, + max_tokens, hide_full_prompt, - do_rag, docs, - spliter_name, - vector_db, chunk_size, chunk_overlap, vector_search_top_k, @@ -155,41 +108,16 @@ async def bot( repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text. conversation_id: unique conversation identifier. 
""" - # req_dict = { - # "history": history, - # "temperature": temperature, - # "top_p": top_p, - # "top_k": top_k, - # "repetition_penalty": repetition_penalty, - # "hide_full_prompt": hide_full_prompt, - # "do_rag": do_rag, - # "docs": docs, - # "spliter_name": spliter_name, - # "vector_db": vector_db, - # "chunk_size": chunk_size, - # "chunk_overlap": chunk_overlap, - # "vector_search_top_k": vector_search_top_k, - # "vector_search_top_n": vector_search_top_n, - # "run_rerank": run_rerank, - # "search_method": search_method, - # "score_threshold": score_threshold, - # "streaming": True - # } - print(history) - new_req = {"messages": history[-1][0]} + stream_opt = True + new_req = {"messages": history[-1][0], "stream": stream_opt, "max_tokens": max_tokens} server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}" # Async for streaming response partial_text = "" async with httpx.AsyncClient() as client: async with client.stream("POST", f"{server_addr}/v1/chatqna", json=new_req, timeout=None) as response: - partial_text = "" - async for chunk in response.aiter_lines(): - new_text = chunk - if new_text.startswith("data"): - new_text = re.sub(r"\r\n", "", chunk.split("data: ")[-1]) - new_text = json.loads(chunk)["choices"][0]["message"]["content"] - partial_text = partial_text + new_text + async for chunk in response.aiter_text(): + partial_text = partial_text + chunk history[-1][1] = partial_text yield history @@ -198,6 +126,7 @@ async def bot( avail_rerank_models = get_local_available_models("rerank") avail_devices = get_available_devices() avail_weights_compression = get_available_weights() + avail_llm_inference_type = get_avail_llm_inference_type() avail_node_parsers = pconf.get_available_node_parsers() avail_indexers = pconf.get_available_indexers() avail_retrievers = pconf.get_available_retrievers() @@ -212,7 +141,7 @@ async def bot( .disclaimer {font-variant-caps: all-small-caps} """ - with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo: + with gr.Blocks(theme=gr.themes.Soft(), css=css) as app: gr.HTML( """ @@ -250,7 +179,7 @@ async def bot(

                        Edge Craft RAG based Q&A Chatbot
-                        Powered by Intel NEXC Edge AI solutions
+                        Powered by Intel
@@ -295,7 +224,6 @@ def get_pipeline_df(): with gr.Row(): rag_create_pipeline = gr.Button("Create Pipeline") rag_activate_pipeline = gr.Button("Activate Pipeline") - rag_remove_pipeline = gr.Button("Remove Pipeline") with gr.Column(variant="panel"): u_pipeline_name = gr.Textbox( @@ -366,6 +294,7 @@ def get_pipeline_df(): label="Embedding run device", # info="Run embedding model on which device?", multiselect=False, + interactive=True, ) with gr.Column(variant="panel"): @@ -415,6 +344,7 @@ def get_pipeline_df(): label="Rerank run device", # info="Run rerank model on which device?", multiselect=False, + interactive=True, ) with gr.Column(variant="panel"): @@ -428,6 +358,10 @@ def get_pipeline_df(): interactive=True, ) + u_llm_infertype = gr.Radio( + choices=avail_llm_inference_type, label="LLM Inference Type", value="local" + ) + with gr.Accordion("LLM Configuration", open=True): u_llm_model_id = gr.Dropdown( choices=avail_llms, @@ -444,12 +378,15 @@ def get_pipeline_df(): label="LLM run device", # info="Run LLM on which device?", multiselect=False, + interactive=True, ) u_llm_weights = gr.Radio( avail_weights_compression, label="Weights", info="weights compression", + value=cfg.llm_weights, + interactive=True, ) # ------------------- @@ -460,14 +397,9 @@ def show_pipeline_detail(evt: gr.SelectData): # get selected pipeline id # Dataframe: {'headers': '', 'data': [[x00, x01], [x10, x11]} # SelectData.index: [i, j] - print(u_pipelines.value["data"]) - print(evt.index) # always use pipeline id for indexing selected_id = pipeline_df[evt.index[0]][0] pl = cli.get_pipeline(selected_id) - # TODO: change to json fomart - # pl["postprocessor"][0]["processor_type"] - # pl["postprocessor"]["model"]["model_id"], pl["postprocessor"]["model"]["device"] return ( pl["name"], pl["status"]["active"], @@ -477,12 +409,16 @@ def show_pipeline_detail(evt: gr.SelectData): pl["indexer"]["indexer_type"], pl["retriever"]["retriever_type"], pl["retriever"]["retrieve_topk"], + pl["postprocessor"][0]["postprocessor_type"], pl["generator"]["generator_type"], + pl["generator"]["inference_type"], pl["generator"]["model"]["model_id"], pl["generator"]["model"]["device"], - "", + pl["generator"]["model"]["weight"], pl["indexer"]["model"]["model_id"], pl["indexer"]["model"]["device"], + pl["postprocessor"][0]["model"]["model_id"] if pl["postprocessor"][0]["model"] is not None else "", + pl["postprocessor"][0]["model"]["device"] if pl["postprocessor"][0]["model"] is not None else "", ) def modify_create_pipeline_button(): @@ -502,6 +438,7 @@ def create_update_pipeline( vector_search_top_k, postprocessor, generator, + llm_infertype, llm_id, llm_device, llm_weights, @@ -521,6 +458,7 @@ def create_update_pipeline( vector_search_top_k, postprocessor, generator, + llm_infertype, llm_id, llm_device, llm_weights, @@ -548,17 +486,18 @@ def create_update_pipeline( u_retriever, u_vector_search_top_k, # postprocessor - # u_postprocessor, + u_postprocessor, # generator u_generator, + u_llm_infertype, # models u_llm_model_id, u_llm_device, u_llm_weights, u_embed_model_id, u_embed_device, - # u_rerank_model_id, - # u_rerank_device + u_rerank_model_id, + u_rerank_device, ], ) @@ -586,6 +525,7 @@ def create_update_pipeline( u_llm_model_id.input, u_llm_device.input, u_llm_weights.input, + u_llm_infertype.input, u_embed_model_id.input, u_embed_device.input, u_rerank_model_id.input, @@ -609,6 +549,7 @@ def create_update_pipeline( u_vector_search_top_k, u_postprocessor, u_generator, + u_llm_infertype, u_llm_model_id, u_llm_device, u_llm_weights, 
@@ -634,8 +575,8 @@ def create_update_pipeline( def get_files(): return cli.get_files() - def create_vectordb(docs, spliter, vector_db): - res = cli.create_vectordb(docs, spliter, vector_db) + def create_vectordb(docs, spliter): + res = cli.create_vectordb(docs, spliter) return gr.update(value=get_files()), res global u_files_selected_row @@ -696,13 +637,6 @@ def delete_file(): multiselect=False, ) - vector_db = gr.Dropdown( - ["FAISS", "Chroma"], - value=cfg.vector_db, - label="Vector Stores", - info="Stores embedded data and performs vector search.", - multiselect=False, - ) load_docs = gr.Button("Upload files") u_files_status = gr.Textbox(label="File Processing Status", value="", interactive=False) @@ -723,12 +657,6 @@ def delete_file(): with gr.Column(): deselect_button = gr.Button("Clear Selection") - do_rag = gr.Checkbox( - value=True, - label="RAG is ON", - interactive=True, - info="Whether to do RAG for generation", - ) with gr.Accordion("Generation Configuration", open=False): with gr.Row(): with gr.Column(): @@ -778,6 +706,17 @@ def delete_file(): interactive=True, info="Penalize repetition — 1.0 to disable.", ) + with gr.Column(): + with gr.Row(): + u_max_tokens = gr.Slider( + label="Max Token Number", + value=512, + minimum=1, + maximum=8192, + step=10, + interactive=True, + info="Set Max Output Token", + ) with gr.Column(scale=4): chatbot = gr.Chatbot( height=600, @@ -795,7 +734,6 @@ def delete_file(): with gr.Column(): with gr.Row(): submit = gr.Button("Submit") - stop = gr.Button("Stop") clear = gr.Button("Clear") retriever_argument = gr.Accordion("Retriever Configuration", open=True) with retriever_argument: @@ -845,7 +783,6 @@ def delete_file(): inputs=[ docs, spliter, - vector_db, ], outputs=[u_files, u_files_status], queue=True, @@ -873,11 +810,9 @@ def delete_file(): top_p, top_k, repetition_penalty, + u_max_tokens, hide_context, - do_rag, docs, - spliter, - vector_db, u_chunk_size, u_chunk_overlap, u_vector_search_top_k, @@ -897,11 +832,9 @@ def delete_file(): top_p, top_k, repetition_penalty, + u_max_tokens, hide_context, - do_rag, docs, - spliter, - vector_db, u_chunk_size, u_chunk_overlap, u_vector_search_top_k, @@ -913,15 +846,8 @@ def delete_file(): chatbot, queue=True, ) - # stop.click( - # fn=request_cancel, - # inputs=None, - # outputs=None, - # cancels=[submit_event, submit_click_event], - # queue=False, - # ) clear.click(lambda: None, None, chatbot, queue=False) - return demo + return app def main(): @@ -929,8 +855,6 @@ def main(): parser = argparse.ArgumentParser(description="Load Embedding and LLM Models with OpenVino.") # Add the arguments parser.add_argument("--prompt_template", type=str, required=False, help="User specific template") - # parser.add_argument("--server_name", type=str, default="0.0.0.0") - # parser.add_argument("--server_port", type=int, default=8082) parser.add_argument("--config", type=str, default="./default.yaml", help="configuration file path") parser.add_argument("--share", action="store_true", help="share model") parser.add_argument("--debug", action="store_true", help="enable debugging") @@ -942,20 +866,20 @@ def main(): init_cfg_(cfg) logger.info(cfg) - demo = build_demo(cfg, args) + app = build_app(cfg, args) # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') + # app.launch(server_name='your server name', server_port='server port in int') # if you have any issue to launch on your platform, you can pass share=True to launch method: 
- # demo.launch(share=True) + # app.launch(share=True) # it creates a publicly shareable link for the interface. Read more in the docs: https://gradio.app/docs/ - # demo.launch(share=True) - demo.queue().launch( + # app.launch(share=True) + app.queue().launch( server_name=UI_SERVICE_HOST_IP, server_port=UI_SERVICE_PORT, share=args.share, allowed_paths=["."] ) # %% # please run this cell for stopping gradio interface - demo.close() + app.close() def init_cfg_(cfg): @@ -969,14 +893,14 @@ def init_cfg_(cfg): cfg.llm_device = "CPU" if "model_language" not in cfg: cfg.model_language = "Chinese" - if "vector_db" not in cfg: - cfg.vector_db = "FAISS" if "splitter_name" not in cfg: cfg.splitter_name = "RecursiveCharacter" # or "Chinese" if "search_method" not in cfg: cfg.search_method = "similarity" if "score_threshold" not in cfg: cfg.score_threshold = 0.5 + if "llm_weights" not in cfg: + cfg.llm_weights = "FP16" if __name__ == "__main__": diff --git a/EdgeCraftRAG/ui/gradio/platform_config.py b/EdgeCraftRAG/ui/gradio/platform_config.py index 852409c1c..3fc3155f9 100644 --- a/EdgeCraftRAG/ui/gradio/platform_config.py +++ b/EdgeCraftRAG/ui/gradio/platform_config.py @@ -90,6 +90,11 @@ def get_available_weights(): return avail_weights_compression +def get_avail_llm_inference_type(): + avail_llm_inference_type = ["local", "vllm"] + return avail_llm_inference_type + + def get_enum_values(c: Enum): return [v.value for k, v in vars(c).items() if not callable(v) and not k.startswith("__") and not k.startswith("_")] @@ -112,3 +117,25 @@ def get_available_postprocessors(): def get_available_generators(): return get_enum_values(GeneratorType) + + +def get_llm_model_dir(prefix, llm_model_id, weights_compression): + model_dirs = { + "fp16_model_dir": prefix + llm_model_id + "/FP16", + "int8_model_dir": prefix + llm_model_id + "/INT8_compressed_weights", + "int4_model_dir": prefix + llm_model_id + "/INT4_compressed_weights", + } + + if weights_compression == "INT4": + model_dir = model_dirs["int4_model_dir"] + elif weights_compression == "INT8": + model_dir = model_dirs["int8_model_dir"] + else: + model_dir = model_dirs["fp16_model_dir"] + + # if not model_dir.exists(): + # raise FileNotFoundError(f"The model directory {model_dir} does not exist.") + # elif not model_dir.is_dir(): + # raise NotADirectoryError(f"The path {model_dir} is not a directory.") + + return model_dir diff --git a/EdgeCraftRAG/ui/gradio/requirements.txt b/EdgeCraftRAG/ui/gradio/requirements.txt new file mode 100644 index 000000000..22bcd0524 --- /dev/null +++ b/EdgeCraftRAG/ui/gradio/requirements.txt @@ -0,0 +1,8 @@ +distro>=1.9.0 +gradio>=4.44.1 +loguru>=0.7.2 +omegaconf>=2.3.0 +openvino>=2024.4.0 +psutil>=6.1.0 +py-cpuinfo>=9.0.0 +uvicorn>=0.30.6
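Since this change threads `stream` and `max_tokens` from the incoming request through `LLMParams` to the generator, the new streaming path can be exercised end to end with a plain `curl` call. A minimal sketch, assuming the mega service is reachable on port 16011 as in the README above and using a placeholder question:

```bash
# -N disables curl's output buffering so the streamed deltas appear as they arrive.
curl -N -X POST http://${HOST_IP}:16011/v1/chatqna \
  -H "Content-Type: application/json" \
  -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "stream": true, "max_tokens": 512}'
```

Omitting `"stream"` (or setting it to `false`) falls back to the non-streaming response, matching the default in `handle_request`.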