diff --git a/EdgeCraftRAG/Dockerfile b/EdgeCraftRAG/Dockerfile
index 3c9711dea..2e6191a01 100644
--- a/EdgeCraftRAG/Dockerfile
+++ b/EdgeCraftRAG/Dockerfile
@@ -13,13 +13,11 @@ RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
-COPY ./edgecraftrag /home/user/edgecraftrag
+COPY ./requirements.txt /home/user/requirements.txt
COPY ./chatqna.py /home/user/chatqna.py
-WORKDIR /home/user/edgecraftrag
-RUN pip install --no-cache-dir -r requirements.txt
-
WORKDIR /home/user
+RUN pip install --no-cache-dir -r requirements.txt
USER user
diff --git a/EdgeCraftRAG/Dockerfile.server b/EdgeCraftRAG/Dockerfile.server
index c04dc0a54..f076dcd16 100644
--- a/EdgeCraftRAG/Dockerfile.server
+++ b/EdgeCraftRAG/Dockerfile.server
@@ -25,6 +25,9 @@ RUN useradd -m -s /bin/bash user && \
COPY ./edgecraftrag /home/user/edgecraftrag
+RUN mkdir -p /home/user/gradio_cache
+ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
+
WORKDIR /home/user/edgecraftrag
RUN pip install --no-cache-dir -r requirements.txt
diff --git a/EdgeCraftRAG/README.md b/EdgeCraftRAG/README.md
index da8d2efb0..a24822532 100644
--- a/EdgeCraftRAG/README.md
+++ b/EdgeCraftRAG/README.md
@@ -7,39 +7,112 @@ quality and performance.
## Quick Start Guide
-### Run Containers with Docker Compose
+### (Optional) Build Docker Images for Mega Service, Server and UI on your own
+
+If you want to build the images on your own, follow these steps:
+
+```bash
+cd GenAIExamples/EdgeCraftRAG
+
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag:latest -f Dockerfile .
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag-server:latest -f Dockerfile.server .
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui .
+```
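+
+To confirm the images were built with the tags above, you can run a quick check:
+
+```bash
+docker images | grep edgecraftrag
+```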
+
+### Using Intel Arc GPU
+
+#### Local inference with OpenVINO for Intel Arc GPU
+
+You can select the "local" type in the generation field, which is the default approach to enabling the Intel Arc GPU for LLM inference. No image build is needed for the "local" type.
+
+#### vLLM with OpenVINO for Intel Arc GPU
+
+You can also select "vLLM" as the generation type. To enable it, you need to build the vLLM image for Intel Arc GPU before bootstrapping the services.
+Please follow [vLLM with OpenVINO](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#build-docker-image) to build the vLLM image.
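+
+A rough build sketch is shown below; the Dockerfile path inside GenAIComps is an assumption, so follow the linked guide for the authoritative steps. The resulting image should be tagged `opea/vllm-arc:latest` to match the compose file.
+
+```bash
+# Sketch only: the Dockerfile location is an assumption,
+# see the vLLM with OpenVINO guide linked above for the exact build steps.
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy \
+  -t opea/vllm-arc:latest \
+  -f comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu .
+```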
+
+### Start Edge Craft RAG Services with Docker Compose
+
+If you want to enable the vLLM with OpenVINO service, please finish the steps in [Launch vLLM with OpenVINO service](#optional-launch-vllm-with-openvino-service) first.
```bash
cd GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc
export MODEL_PATH="your model path for all your models"
export DOC_PATH="your doc path for uploading a dir of files"
+export GRADIO_PATH="your gradio cache path for transferring files"
+
+# Make sure all 3 folders are owned by 1000:1000; otherwise, run
+# chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${GRADIO_PATH}
+
+# Use `ip a` to check your active IP address
export HOST_IP="your host ip"
-export UI_SERVICE_PORT="port for UI service"
-# Optional for vllm endpoint
-export vLLM_ENDPOINT="http://${HOST_IP}:8008"
+# Check the group IDs of video and render
+export VIDEOGROUPID=$(getent group video | cut -d: -f3)
+export RENDERGROUPID=$(getent group render | cut -d: -f3)
# If you have a proxy configured, uncomment below line
-# export no_proxy=$no_proxy,${HOST_IP},edgecraftrag,edgecraftrag-server
+# export no_proxy=${no_proxy},${HOST_IP},edgecraftrag,edgecraftrag-server
+# export NO_PROXY=${NO_PROXY},${HOST_IP},edgecraftrag,edgecraftrag-server
# If you have a HF mirror configured, it will be imported to the container
# export HF_ENDPOINT="your HF mirror endpoint"
# By default, the ports of the containers are set, uncomment if you want to change
# export MEGA_SERVICE_PORT=16011
# export PIPELINE_SERVICE_PORT=16011
+# export UI_SERVICE_PORT="8082"
+
+# Prepare models for embedding, reranking and generation; you can also choose other OpenVINO-optimized models.
+# For example:
+pip install --upgrade --upgrade-strategy eager "optimum[openvino]"
+
+optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity
+optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-reranker-large --task sentence-similarity
+optimum-cli export openvino -m Qwen/Qwen2-7B-Instruct ${MODEL_PATH}/Qwen/Qwen2-7B-Instruct/INT4_compressed_weights --weight-format int4
docker compose up -d
+
```
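+
+After the containers start, a quick sanity check is to confirm the pipeline service responds (assuming the default pipeline service port 16010):
+
+```bash
+docker ps
+curl -X GET http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" | jq '.'
+```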
-### (Optional) Build Docker Images for Mega Service, Server and UI by your own
+#### (Optional) Launch vLLM with OpenVINO service
+
+1. Set up Environment Variables
```bash
-cd GenAIExamples/EdgeCraftRAG
+export LLM_MODEL="your model id" # e.g. Qwen/Qwen2-7B-Instruct
+export VLLM_SERVICE_PORT=8008
+export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
+export HUGGINGFACEHUB_API_TOKEN="your HF token"
+```
-docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag:latest -f Dockerfile .
-docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-server:latest -f Dockerfile.server .
-docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui .
+2. Uncomment the `vllm-openvino-server` section in `GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml`:
+
+```yaml
+ # vllm-openvino-server:
+ # container_name: vllm-openvino-server
+ # image: opea/vllm-arc:latest
+ # ports:
+ # - ${VLLM_SERVICE_PORT:-8008}:80
+ # environment:
+ # HTTPS_PROXY: ${https_proxy}
+  #     HTTP_PROXY: ${http_proxy}
+ # VLLM_OPENVINO_DEVICE: GPU
+ # HF_ENDPOINT: ${HF_ENDPOINT}
+ # HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ # volumes:
+ # - /dev/dri/by-path:/dev/dri/by-path
+ # - $HOME/.cache/huggingface:/root/.cache/huggingface
+ # devices:
+ # - /dev/dri
+ # entrypoint: /bin/bash -c "\
+ # cd / && \
+ # export VLLM_CPU_KVCACHE_SPACE=50 && \
+ # export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON && \
+ # python3 -m vllm.entrypoints.openai.api_server \
+ # --model '${LLM_MODEL}' \
+ # --max_model_len=1024 \
+ # --host 0.0.0.0 \
+ # --port 80"
```
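+
+3. Start the vLLM service and check that it is serving your model. A minimal check, assuming the service name above and vLLM's OpenAI-compatible `/v1/models` route:
+
+```bash
+docker compose up -d vllm-openvino-server
+curl http://${HOST_IP}:${VLLM_SERVICE_PORT}/v1/models | jq '.'
+```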
### ChatQnA with LLM Example (Command Line)
@@ -109,7 +182,7 @@ curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: app
# }
# Prepare data from local directory
-curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"#REPLACE WITH YOUR LOCAL DOC DIR#"}' | jq '.'
+curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR DIR WITHIN MOUNTED DOC PATH#"}' | jq '.'
# Validate Mega Service
curl -X POST http://${HOST_IP}:16011/v1/chatqna -H "Content-Type: application/json" -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "top_n":5, "max_tokens":512}' | jq '.'
@@ -121,33 +194,14 @@ Open your browser, access http://${HOST_IP}:8082
> Your browser should be running on the same host of your console, otherwise you will need to access UI with your host domain name instead of ${HOST_IP}.
-### (Optional) Launch vLLM with OpenVINO service
+To create a default pipeline, you need to click the `Create Pipeline` button on the `RAG Settings` page. You can also create multiple pipelines or update existing pipelines through the `Pipeline Configuration`, but please note that active pipelines cannot be updated.
+![create_pipeline](assets/img/create_pipeline.png)
-```bash
-# 1. export LLM_MODEL
-export LLM_MODEL="your model id"
-# 2. Uncomment below code in 'GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml'
- # vllm-service:
- # image: vllm:openvino
- # container_name: vllm-openvino-server
- # depends_on:
- # - vllm-service
- # ports:
- # - "8008:80"
- # environment:
- # no_proxy: ${no_proxy}
- # http_proxy: ${http_proxy}
- # https_proxy: ${https_proxy}
- # vLLM_ENDPOINT: ${vLLM_ENDPOINT}
- # LLM_MODEL: ${LLM_MODEL}
- # entrypoint: /bin/bash -c "\
- # cd / && \
- # export VLLM_CPU_KVCACHE_SPACE=50 && \
- # python3 -m vllm.entrypoints.openai.api_server \
- # --model '${LLM_MODEL}' \
- # --host 0.0.0.0 \
- # --port 80"
-```
+After creating a pipeline, you can upload your data on the `Chatbot` page.
+![upload_data](assets/img/upload_data.png)
+
+Then, you can submit messages in the chat box.
+![chat_with_rag](assets/img/chat_with_rag.png)
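+
+Alternatively, the same chat can be driven from the command line. Below is a sketch of a streaming request to the mega service, assuming the default port 16011 and an active pipeline:
+
+```bash
+curl -X POST http://${HOST_IP}:16011/v1/chatqna -H "Content-Type: application/json" -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "stream": true, "max_tokens":512}'
+```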
## Advanced User Guide
@@ -156,27 +210,13 @@ export LLM_MODEL="your model id"
#### Create a pipeline
```bash
-curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline.json | jq '.'
-```
-
-It will take some time to prepare the embedding model.
-
-#### Upload a text
-
-```bash
-curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.'
-```
-
-#### Provide a query to retrieve context with similarity search.
-
-```bash
-curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d @examples/test_query.json | jq '.'
+curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.'
```
-#### Create the second pipeline test2
+#### Update a pipeline
```bash
-curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline2.json | jq '.'
+curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.'
```
#### Check all pipelines
@@ -185,19 +225,10 @@ curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: app
curl -X GET http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" | jq '.'
```
-#### Compare similarity retrieval (test1) and keyword retrieval (test2)
+#### Activate a pipeline
```bash
-# Activate pipeline test1
curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test1 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.'
-# Similarity retrieval
-curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.'
-
-# Activate pipeline test2
-curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test2 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.'
-# Keyword retrieval
-curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.'
-
```
### Model Management
@@ -205,7 +236,7 @@ curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/
#### Load a model
```bash
-curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d @examples/test_model_load.json | jq '.'
+curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d '{"model_type": "reranker", "model_id": "BAAI/bge-reranker-large", "model_path": "./models/bge_ov_reranker", "device": "cpu"}' | jq '.'
```
It will take some time to load the model.
@@ -219,7 +250,7 @@ curl -X GET http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: applica
#### Update a model
```bash
-curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d @examples/test_model_update.json | jq '.'
+curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d '{"model_type": "reranker", "model_id": "BAAI/bge-reranker-large", "model_path": "./models/bge_ov_reranker", "device": "gpu"}' | jq '.'
```
#### Check a certain model
@@ -239,14 +270,14 @@ curl -X DELETE http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-larg
#### Add a text
```bash
-curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.'
+curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"text":"#REPLACE WITH YOUR TEXT#"}' | jq '.'
```
#### Add files from an existing file path
```bash
-curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_dir.json | jq '.'
-curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.'
+curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR DIR WITHIN MOUNTED DOC PATH#"}' | jq '.'
+curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR FILE WITHIN MOUNTED DOC PATH#"}' | jq '.'
```
#### Check all files
@@ -270,5 +301,5 @@ curl -X DELETE http://${HOST_IP}:16010/v1/data/files/test2.docx -H "Content-Type
#### Update a file
```bash
-curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.'
+curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR FILE WITHIN MOUNTED DOC PATH#"}' | jq '.'
```
diff --git a/EdgeCraftRAG/assets/img/chat_with_rag.png b/EdgeCraftRAG/assets/img/chat_with_rag.png
new file mode 100644
index 000000000..04000ef37
Binary files /dev/null and b/EdgeCraftRAG/assets/img/chat_with_rag.png differ
diff --git a/EdgeCraftRAG/assets/img/create_pipeline.png b/EdgeCraftRAG/assets/img/create_pipeline.png
new file mode 100644
index 000000000..53331b2b7
Binary files /dev/null and b/EdgeCraftRAG/assets/img/create_pipeline.png differ
diff --git a/EdgeCraftRAG/assets/img/upload_data.png b/EdgeCraftRAG/assets/img/upload_data.png
new file mode 100644
index 000000000..8fff43e68
Binary files /dev/null and b/EdgeCraftRAG/assets/img/upload_data.png differ
diff --git a/EdgeCraftRAG/chatqna.py b/EdgeCraftRAG/chatqna.py
index 1afa9621c..02f0a84dd 100644
--- a/EdgeCraftRAG/chatqna.py
+++ b/EdgeCraftRAG/chatqna.py
@@ -18,6 +18,7 @@
ChatMessage,
UsageInfo,
)
+from comps.cores.proto.docarray import LLMParams
from fastapi import Request
from fastapi.responses import StreamingResponse
@@ -30,7 +31,20 @@ def __init__(self, megaservice, host="0.0.0.0", port=16011):
async def handle_request(self, request: Request):
input = await request.json()
- result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input)
+ stream_opt = input.get("stream", False)
+ chat_request = ChatCompletionRequest.parse_obj(input)
+ parameters = LLMParams(
+ max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+ top_k=chat_request.top_k if chat_request.top_k else 10,
+ top_p=chat_request.top_p if chat_request.top_p else 0.95,
+ temperature=chat_request.temperature if chat_request.temperature else 0.01,
+ frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+ presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+ repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
+ streaming=stream_opt,
+ chat_template=chat_request.chat_template if chat_request.chat_template else None,
+ )
+ result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input, llm_parameters=parameters)
for node, response in result_dict.items():
if isinstance(response, StreamingResponse):
return response
@@ -61,7 +75,7 @@ def add_remote_service(self):
port=PIPELINE_SERVICE_PORT,
endpoint="/v1/chatqna",
use_remote_service=True,
- service_type=ServiceType.UNDEFINED,
+ service_type=ServiceType.LLM,
)
self.megaservice.add(edgecraftrag)
self.gateway = EdgeCraftRagGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port)
diff --git a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml
index f877b7c58..a695fbc02 100644
--- a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml
+++ b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml
@@ -14,12 +14,15 @@ services:
volumes:
- ${MODEL_PATH:-${PWD}}:/home/user/models
- ${DOC_PATH:-${PWD}}:/home/user/docs
+ - ${GRADIO_PATH:-${PWD}}:/home/user/gradio_cache
+ - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
ports:
- ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
devices:
- /dev/dri:/dev/dri
group_add:
- - video
+ - ${VIDEOGROUPID:-44}
+ - ${RENDERGROUPID:-109}
ecrag:
image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
container_name: edgecraftrag
@@ -48,31 +51,42 @@ services:
PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
+ volumes:
+ - ${GRADIO_PATH:-${PWD}}:/home/user/gradio_cache
ports:
- - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
+ - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
restart: always
depends_on:
- server
- ecrag
- # vllm-service:
- # image: vllm:openvino
+ # vllm-openvino-server:
# container_name: vllm-openvino-server
+ # image: opea/vllm-arc:latest
# ports:
- # - "8008:80"
+ # - ${VLLM_SERVICE_PORT:-8008}:80
# environment:
- # no_proxy: ${no_proxy}
- # http_proxy: ${http_proxy}
- # https_proxy: ${https_proxy}
- # vLLM_ENDPOINT: ${vLLM_ENDPOINT}
- # LLM_MODEL: ${LLM_MODEL}
+ # HTTPS_PROXY: ${https_proxy}
+      # HTTP_PROXY: ${http_proxy}
+ # VLLM_OPENVINO_DEVICE: GPU
+ # HF_ENDPOINT: ${HF_ENDPOINT}
+ # HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ # volumes:
+ # - /dev/dri/by-path:/dev/dri/by-path
+ # - $HOME/.cache/huggingface:/root/.cache/huggingface
+ # devices:
+ # - /dev/dri
+ # group_add:
+ # - ${VIDEOGROUPID:-44}
+ # - ${RENDERGROUPID:-109}
# entrypoint: /bin/bash -c "\
# cd / && \
# export VLLM_CPU_KVCACHE_SPACE=50 && \
+ # export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON && \
# python3 -m vllm.entrypoints.openai.api_server \
# --model '${LLM_MODEL}' \
+ # --max_model_len=1024 \
# --host 0.0.0.0 \
# --port 80"
-
networks:
default:
driver: bridge
diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py
index dfd32c29e..8249950d0 100644
--- a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py
+++ b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py
@@ -25,5 +25,8 @@ async def retrieval(request: ChatCompletionRequest):
# ChatQnA
@chatqna_app.post(path="/v1/chatqna")
async def chatqna(request: ChatCompletionRequest):
- ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request)
- return str(ret)
+ if request.stream:
+ return ctx.get_pipeline_mgr().run_pipeline(chat_request=request)
+ else:
+ ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request)
+ return str(ret)
diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py
index 9d008e82f..f58390cfd 100644
--- a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py
+++ b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py
@@ -157,16 +157,13 @@ def update_pipeline_handler(pl, req):
gen = req.generator
if gen.model is None:
return "No ChatQnA Model"
- if gen.inference_type == InferenceType.VLLM:
- if gen.model.model_id:
- model_ref = gen.model.model_id
- else:
- model_ref = gen.model.model_path
- pl.generator = QnAGenerator(model_ref, gen.prompt_path, gen.inference_type)
- elif gen.inference_type == InferenceType.LOCAL:
+ if gen.inference_type:
model = ctx.get_model_mgr().search_model(gen.model)
if model is None:
- gen.model.model_type = ModelType.LLM
+ if gen.inference_type == InferenceType.VLLM:
+ gen.model.model_type = ModelType.VLLM
+ else:
+ gen.model.model_type = ModelType.LLM
model = ctx.get_model_mgr().load_model(gen.model)
ctx.get_model_mgr().add(model)
# Use weakref to achieve model deletion and memory release
diff --git a/EdgeCraftRAG/edgecraftrag/api_schema.py b/EdgeCraftRAG/edgecraftrag/api_schema.py
index 1f124a7f9..5927e0304 100644
--- a/EdgeCraftRAG/edgecraftrag/api_schema.py
+++ b/EdgeCraftRAG/edgecraftrag/api_schema.py
@@ -10,6 +10,7 @@ class ModelIn(BaseModel):
model_type: Optional[str] = "LLM"
model_id: Optional[str]
model_path: Optional[str] = "./"
+ weight: Optional[str]
device: Optional[str] = "cpu"
diff --git a/EdgeCraftRAG/edgecraftrag/base.py b/EdgeCraftRAG/edgecraftrag/base.py
index d8c7aaef8..a163c486f 100644
--- a/EdgeCraftRAG/edgecraftrag/base.py
+++ b/EdgeCraftRAG/edgecraftrag/base.py
@@ -27,6 +27,7 @@ class ModelType(str, Enum):
EMBEDDING = "embedding"
RERANKER = "reranker"
LLM = "llm"
+ VLLM = "vllm"
class FileType(str, Enum):
diff --git a/EdgeCraftRAG/edgecraftrag/components/generator.py b/EdgeCraftRAG/edgecraftrag/components/generator.py
index cbfd6686d..a888bf18f 100644
--- a/EdgeCraftRAG/edgecraftrag/components/generator.py
+++ b/EdgeCraftRAG/edgecraftrag/components/generator.py
@@ -1,10 +1,11 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+import asyncio
import dataclasses
import os
-from comps import GeneratedDoc, opea_telemetry
+from comps import GeneratedDoc
from edgecraftrag.base import BaseComponent, CompType, GeneratorType
from fastapi.responses import StreamingResponse
from langchain_core.prompts import PromptTemplate
@@ -12,18 +13,6 @@
from pydantic import model_serializer
-@opea_telemetry
-def post_process_text(text: str):
- if text == " ":
- return "data: @#$\n\n"
- if text == "\n":
-        return "data: <br/>\n\n"
- if text.isspace():
- return None
- new_text = text.replace(" ", "@#$")
- return f"data: {new_text}\n\n"
-
-
class QnAGenerator(BaseComponent):
def __init__(self, llm_model, prompt_template, inference_type, **kwargs):
@@ -76,8 +65,18 @@ def run(self, chat_request, retrieved_nodes, **kwargs):
repetition_penalty=chat_request.repetition_penalty,
)
self.llm().generate_kwargs = generate_kwargs
+ if chat_request.stream:
+
+ async def stream_generator():
+ response = self.llm().stream_complete(prompt_str)
+ for r in response:
+ yield r.delta
+ # Simulate asynchronous operation
+ await asyncio.sleep(0.01)
- return self.llm().complete(prompt_str)
+ return StreamingResponse(stream_generator(), media_type="text/event-stream")
+ else:
+ return self.llm().complete(prompt_str)
def run_vllm(self, chat_request, retrieved_nodes, **kwargs):
if self.llm is None:
@@ -92,7 +91,7 @@ def run_vllm(self, chat_request, retrieved_nodes, **kwargs):
prompt_str = self.prompt.format(input=query, context=text_gen_context)
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8008")
- model_name = self.llm
+ model_name = self.llm().model_id
llm = OpenAILike(
api_key="fake",
api_base=llm_endpoint + "/v1",
@@ -106,12 +105,10 @@ def run_vllm(self, chat_request, retrieved_nodes, **kwargs):
if chat_request.stream:
async def stream_generator():
- response = await llm.astream_complete(prompt_str)
- async for text in response:
- output = text.text
- yield f"data: {output}\n\n"
-
- yield "data: [DONE]\n\n"
+ response = llm.stream_complete(prompt_str)
+ for text in response:
+ yield text.delta
+ await asyncio.sleep(0.01)
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
@@ -122,7 +119,12 @@ async def stream_generator():
@model_serializer
def ser_model(self):
- set = {"idx": self.idx, "generator_type": self.comp_subtype, "model": self.model_id}
+ set = {
+ "idx": self.idx,
+ "generator_type": self.comp_subtype,
+ "inference_type": self.inference_type,
+ "model": self.llm(),
+ }
return set
diff --git a/EdgeCraftRAG/edgecraftrag/components/model.py b/EdgeCraftRAG/edgecraftrag/components/model.py
index 72ee7f16e..75fa69c41 100644
--- a/EdgeCraftRAG/edgecraftrag/components/model.py
+++ b/EdgeCraftRAG/edgecraftrag/components/model.py
@@ -14,6 +14,7 @@ class BaseModelComponent(BaseComponent):
model_id: Optional[str] = Field(default="")
model_path: Optional[str] = Field(default="")
+ weight: Optional[str] = Field(default="")
device: Optional[str] = Field(default="cpu")
def run(self, **kwargs) -> Any:
@@ -26,6 +27,7 @@ def ser_model(self):
"type": self.comp_subtype,
"model_id": self.model_id,
"model_path": self.model_path,
+ "weight": self.weight,
"device": self.device,
}
return set
@@ -33,7 +35,7 @@ def ser_model(self):
class OpenVINOEmbeddingModel(BaseModelComponent, OpenVINOEmbedding):
- def __init__(self, model_id, model_path, device):
+ def __init__(self, model_id, model_path, device, weight):
OpenVINOEmbedding.create_and_save_openvino_model(model_id, model_path)
OpenVINOEmbedding.__init__(self, model_id_or_path=model_path, device=device)
self.comp_type = CompType.MODEL
@@ -41,11 +43,12 @@ def __init__(self, model_id, model_path, device):
self.model_id = model_id
self.model_path = model_path
self.device = device
+ self.weight = ""
class OpenVINORerankModel(BaseModelComponent, OpenVINORerank):
- def __init__(self, model_id, model_path, device):
+ def __init__(self, model_id, model_path, device, weight):
OpenVINORerank.create_and_save_openvino_model(model_id, model_path)
OpenVINORerank.__init__(
self,
@@ -57,11 +60,12 @@ def __init__(self, model_id, model_path, device):
self.model_id = model_id
self.model_path = model_path
self.device = device
+ self.weight = ""
class OpenVINOLLMModel(BaseModelComponent, OpenVINOLLM):
- def __init__(self, model_id, model_path, device):
+ def __init__(self, model_id, model_path, device, weight):
OpenVINOLLM.__init__(
self,
model_id_or_path=model_path,
@@ -72,3 +76,4 @@ def __init__(self, model_id, model_path, device):
self.model_id = model_id
self.model_path = model_path
self.device = device
+ self.weight = weight
diff --git a/EdgeCraftRAG/edgecraftrag/components/pipeline.py b/EdgeCraftRAG/edgecraftrag/components/pipeline.py
index 4a2932e00..5af8b5cbe 100644
--- a/EdgeCraftRAG/edgecraftrag/components/pipeline.py
+++ b/EdgeCraftRAG/edgecraftrag/components/pipeline.py
@@ -110,8 +110,10 @@ def model_existed(self, model_id: str) -> bool:
return True
if self.generator:
llm = self.generator.llm
- if llm() and llm().model_id == model_id:
- return True
+ if isinstance(llm, str):
+ return llm == model_id
+ else:
+ return llm().model_id == model_id
return False
@@ -154,7 +156,8 @@ def run_test_generator(pl: Pipeline, chat_request: ChatCompletionRequest) -> Any
if pl.generator is None:
return "No Generator Specified"
if pl.generator.inference_type == InferenceType.LOCAL:
- answer = pl.generator.run(chat_request, retri_res)
+ return pl.generator.run(chat_request, retri_res)
elif pl.generator.inference_type == InferenceType.VLLM:
- answer = pl.generator.run_vllm(chat_request, retri_res)
- return answer
+ return pl.generator.run_vllm(chat_request, retri_res)
+ else:
+ return "LLM inference_type not supported"
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py
index 73a77e48a..6d0166bc5 100644
--- a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py
+++ b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py
@@ -3,9 +3,14 @@
import asyncio
-from edgecraftrag.api_schema import IndexerIn, ModelIn, NodeParserIn
-from edgecraftrag.base import BaseComponent, BaseMgr, CallbackType, ModelType
-from edgecraftrag.components.model import OpenVINOEmbeddingModel, OpenVINOLLMModel, OpenVINORerankModel
+from edgecraftrag.api_schema import ModelIn
+from edgecraftrag.base import BaseComponent, BaseMgr, CompType, ModelType
+from edgecraftrag.components.model import (
+ BaseModelComponent,
+ OpenVINOEmbeddingModel,
+ OpenVINOLLMModel,
+ OpenVINORerankModel,
+)
class ModelMgr(BaseMgr):
@@ -78,17 +83,25 @@ def load_model(model_para: ModelIn):
model_id=model_para.model_id,
model_path=model_para.model_path,
device=model_para.device,
+ weight=model_para.weight,
)
case ModelType.RERANKER:
model = OpenVINORerankModel(
model_id=model_para.model_id,
model_path=model_para.model_path,
device=model_para.device,
+ weight=model_para.weight,
)
case ModelType.LLM:
model = OpenVINOLLMModel(
model_id=model_para.model_id,
model_path=model_para.model_path,
device=model_para.device,
+ weight=model_para.weight,
)
+ case ModelType.VLLM:
+ model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="")
+ model.comp_type = CompType.MODEL
+ model.comp_subtype = ModelType.VLLM
+ model.model_id_or_path = model_para.model_id
return model
diff --git a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt
index 800d1fa2f..aa57e6059 100644
--- a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt
+++ b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt
@@ -5,4 +5,4 @@
<|im_start|>System: Pay attention to your formatting of response. If you need to reference content from context, try to keep the formatting.<|im_end|>
<|im_start|>System: Try to summarize from the context, do some reasoning before response, then response. Make sure your response is logically sound and self-consistent.<|im_end|>
-<|im_start|>{input}
+<|im_start|>{input}
\ No newline at end of file
diff --git a/EdgeCraftRAG/edgecraftrag/requirements.txt b/EdgeCraftRAG/edgecraftrag/requirements.txt
index 3756c732a..6757aa752 100644
--- a/EdgeCraftRAG/edgecraftrag/requirements.txt
+++ b/EdgeCraftRAG/edgecraftrag/requirements.txt
@@ -1,6 +1,5 @@
docx2txt
faiss-cpu>=1.8.0.post1
-gradio>=4.44.1
langchain-core==0.2.29
llama-index>=0.11.0
llama-index-embeddings-openvino>=0.4.0
@@ -9,8 +8,4 @@ llama-index-llms-openvino>=0.3.1
llama-index-postprocessor-openvino-rerank>=0.3.0
llama-index-retrievers-bm25>=0.3.0
llama-index-vector-stores-faiss>=0.2.1
-loguru>=0.7.2
-omegaconf>=2.3.0
opea-comps>=0.9
-py-cpuinfo>=9.0.0
-uvicorn>=0.30.6
diff --git a/EdgeCraftRAG/requirements.txt b/EdgeCraftRAG/requirements.txt
new file mode 100644
index 000000000..5b27f1434
--- /dev/null
+++ b/EdgeCraftRAG/requirements.txt
@@ -0,0 +1,2 @@
+fastapi>=0.115.0
+opea-comps>=0.9
diff --git a/EdgeCraftRAG/tests/test_pipeline_local_llm.json b/EdgeCraftRAG/tests/test_pipeline_local_llm.json
index 18895d6e5..13485cebc 100644
--- a/EdgeCraftRAG/tests/test_pipeline_local_llm.json
+++ b/EdgeCraftRAG/tests/test_pipeline_local_llm.json
@@ -9,7 +9,6 @@
"indexer_type": "faiss_vector",
"embedding_model": {
"model_id": "BAAI/bge-small-en-v1.5",
- "model_path": "./models/bge_ov_embedding",
"device": "auto"
}
},
@@ -23,7 +22,6 @@
"top_n": 2,
"reranker_model": {
"model_id": "BAAI/bge-reranker-large",
- "model_path": "./models/bge_ov_reranker",
"device": "auto"
}
}
@@ -31,7 +29,6 @@
"generator": {
"model": {
"model_id": "Qwen/Qwen2-7B-Instruct",
- "model_path": "./models/qwen2-7b-instruct/INT4_compressed_weights",
"device": "cpu"
},
"prompt_path": "./edgecraftrag/prompt_template/default_prompt.txt",
diff --git a/EdgeCraftRAG/ui/docker/Dockerfile.ui b/EdgeCraftRAG/ui/docker/Dockerfile.ui
index 46a14a6e9..3dacb35d8 100644
--- a/EdgeCraftRAG/ui/docker/Dockerfile.ui
+++ b/EdgeCraftRAG/ui/docker/Dockerfile.ui
@@ -11,10 +11,11 @@ RUN useradd -m -s /bin/bash user && \
COPY ./ui/gradio /home/user/ui
COPY ./edgecraftrag /home/user/edgecraftrag
-WORKDIR /home/user/edgecraftrag
-RUN pip install --no-cache-dir -r requirements.txt
+RUN mkdir -p /home/user/gradio_cache
+ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
WORKDIR /home/user/ui
+RUN pip install --no-cache-dir -r requirements.txt
USER user
diff --git a/EdgeCraftRAG/ui/gradio/default.yaml b/EdgeCraftRAG/ui/gradio/default.yaml
index 1421da8f4..39c3ee92e 100644
--- a/EdgeCraftRAG/ui/gradio/default.yaml
+++ b/EdgeCraftRAG/ui/gradio/default.yaml
@@ -3,7 +3,6 @@
# Model language for LLM
model_language: "Chinese"
-vector_db: "FAISS"
splitter_name: "RecursiveCharacter"
k_rerank: 5
search_method: "similarity"
@@ -29,21 +28,19 @@ k_retrieval: 30
postprocessor: "reranker"
# Generator
-generator: "local"
-prompt_path: "./data/default_prompt.txt"
+generator: "chatqna"
+prompt_path: "./edgecraftrag/prompt_template/default_prompt.txt"
# Models
embedding_model_id: "BAAI/bge-small-en-v1.5"
-embedding_model_path: "./bge_ov_embedding"
# Device for embedding model inference
embedding_device: "AUTO"
rerank_model_id: "BAAI/bge-reranker-large"
-rerank_model_path: "./bge_ov_reranker"
# Device for reranking model inference
rerank_device: "AUTO"
-llm_model_id: "qwen2-7b-instruct"
-llm_model_path: "./qwen2-7b-instruct/INT4_compressed_weights"
+llm_model_id: "Qwen/Qwen2-7B-Instruct"
+llm_weights: "INT4"
# Device for LLM model inference
llm_device: "AUTO"
diff --git a/EdgeCraftRAG/ui/gradio/ecrag_client.py b/EdgeCraftRAG/ui/gradio/ecrag_client.py
index 47b5f776d..6593cbd94 100644
--- a/EdgeCraftRAG/ui/gradio/ecrag_client.py
+++ b/EdgeCraftRAG/ui/gradio/ecrag_client.py
@@ -1,13 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+import os
import sys
+import platform_config as pconf
import requests
sys.path.append("..")
-import os
-
from edgecraftrag import api_schema
PIPELINE_SERVICE_HOST_IP = os.getenv("PIPELINE_SERVICE_HOST_IP", "127.0.0.1")
@@ -42,6 +42,7 @@ def create_update_pipeline(
vector_search_top_k,
postprocessor,
generator,
+ llm_infertype,
llm_id,
llm_device,
llm_weights,
@@ -50,6 +51,7 @@ def create_update_pipeline(
rerank_id,
rerank_device,
):
+ llm_path = pconf.get_llm_model_dir("./models/", llm_id, llm_weights)
req_dict = api_schema.PipelineCreateIn(
name=name,
active=active,
@@ -60,9 +62,9 @@ def create_update_pipeline(
indexer_type=indexer,
embedding_model=api_schema.ModelIn(
model_id=embedding_id,
- # TODO: remove hardcoding
- model_path="./bge_ov_embedding",
+ model_path="./models/" + embedding_id,
device=embedding_device,
+ weight=llm_weights,
),
),
retriever=api_schema.RetrieverIn(retriever_type=retriever, retriever_topk=vector_search_top_k),
@@ -70,22 +72,15 @@ def create_update_pipeline(
api_schema.PostProcessorIn(
processor_type=postprocessor[0],
reranker_model=api_schema.ModelIn(
- model_id=rerank_id,
- # TODO: remove hardcoding
- model_path="./bge_ov_reranker",
- device=rerank_device,
+ model_id=rerank_id, model_path="./models/" + rerank_id, device=rerank_device, weight=llm_weights
),
)
],
generator=api_schema.GeneratorIn(
# TODO: remove hardcoding
prompt_path="./edgecraftrag/prompt_template/default_prompt.txt",
- model=api_schema.ModelIn(
- model_id=llm_id,
- # TODO: remove hardcoding
- model_path="./models/qwen2-7b-instruct/INT4_compressed_weights",
- device=llm_device,
- ),
+ model=api_schema.ModelIn(model_id=llm_id, model_path=llm_path, device=llm_device, weight=llm_weights),
+ inference_type=llm_infertype,
),
)
# hard code only for test
@@ -105,7 +100,7 @@ def activate_pipeline(name):
return restext, status
-def create_vectordb(docs, spliter, vector_db):
+def create_vectordb(docs, spliter):
req_dict = api_schema.FilesIn(local_paths=docs)
res = requests.post(f"{server_addr}/v1/data/files", json=req_dict.dict(), proxies={"http": None})
return res.text
@@ -116,6 +111,8 @@ def get_files():
files = []
for file in res.json():
files.append((file["file_name"], file["file_id"]))
+ if not files:
+ files.append((None, None))
return files
diff --git a/EdgeCraftRAG/ui/gradio/ecragui.py b/EdgeCraftRAG/ui/gradio/ecragui.py
index 3c198bf2a..23a5286de 100644
--- a/EdgeCraftRAG/ui/gradio/ecragui.py
+++ b/EdgeCraftRAG/ui/gradio/ecragui.py
@@ -2,11 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
-import json
+import os
import platform
-import re
from datetime import datetime
-from pathlib import Path
import cpuinfo
import distro # if running Python 3.8 or above
@@ -17,41 +15,22 @@
# Creation of the ModelLoader instance and loading models remain the same
import platform_config as pconf
import psutil
-import requests
from loguru import logger
from omegaconf import OmegaConf
-from platform_config import get_available_devices, get_available_weights, get_local_available_models
+from platform_config import (
+ get_avail_llm_inference_type,
+ get_available_devices,
+ get_available_weights,
+ get_local_available_models,
+)
pipeline_df = []
-import os
MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "127.0.0.1")
MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 16011))
UI_SERVICE_HOST_IP = os.getenv("UI_SERVICE_HOST_IP", "0.0.0.0")
-UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8084))
-
-
-def get_llm_model_dir(llm_model_id, weights_compression):
- model_dirs = {
- "fp16_model_dir": Path(llm_model_id) / "FP16",
- "int8_model_dir": Path(llm_model_id) / "INT8_compressed_weights",
- "int4_model_dir": Path(llm_model_id) / "INT4_compressed_weights",
- }
-
- if weights_compression == "INT4":
- model_dir = model_dirs["int4_model_dir"]
- elif weights_compression == "INT8":
- model_dir = model_dirs["int8_model_dir"]
- else:
- model_dir = model_dirs["fp16_model_dir"]
-
- if not model_dir.exists():
- raise FileNotFoundError(f"The model directory {model_dir} does not exist.")
- elif not model_dir.is_dir():
- raise NotADirectoryError(f"The path {model_dir} is not a directory.")
-
- return model_dir
+UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8082))
def get_system_status():
@@ -87,31 +66,7 @@ def get_system_status():
return status
-def build_demo(cfg, args):
-
- def load_chatbot_models(
- llm_id,
- llm_device,
- llm_weights,
- embedding_id,
- embedding_device,
- rerank_id,
- rerank_device,
- ):
- req_dict = {
- "llm_id": llm_id,
- "llm_device": llm_device,
- "llm_weights": llm_weights,
- "embedding_id": embedding_id,
- "embedding_device": embedding_device,
- "rerank_id": rerank_id,
- "rerank_device": rerank_device,
- }
- # hard code only for test
- worker_addr = "http://127.0.0.1:8084"
- print(req_dict)
- result = requests.post(f"{worker_addr}/load", json=req_dict, proxies={"http": None})
- return result.text
+def build_app(cfg, args):
def user(message, history):
"""Callback function for updating user messages in interface on submit button click.
@@ -131,11 +86,9 @@ async def bot(
top_p,
top_k,
repetition_penalty,
+ max_tokens,
hide_full_prompt,
- do_rag,
docs,
- spliter_name,
- vector_db,
chunk_size,
chunk_overlap,
vector_search_top_k,
@@ -155,41 +108,16 @@ async def bot(
repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.
conversation_id: unique conversation identifier.
"""
- # req_dict = {
- # "history": history,
- # "temperature": temperature,
- # "top_p": top_p,
- # "top_k": top_k,
- # "repetition_penalty": repetition_penalty,
- # "hide_full_prompt": hide_full_prompt,
- # "do_rag": do_rag,
- # "docs": docs,
- # "spliter_name": spliter_name,
- # "vector_db": vector_db,
- # "chunk_size": chunk_size,
- # "chunk_overlap": chunk_overlap,
- # "vector_search_top_k": vector_search_top_k,
- # "vector_search_top_n": vector_search_top_n,
- # "run_rerank": run_rerank,
- # "search_method": search_method,
- # "score_threshold": score_threshold,
- # "streaming": True
- # }
- print(history)
- new_req = {"messages": history[-1][0]}
+ stream_opt = True
+ new_req = {"messages": history[-1][0], "stream": stream_opt, "max_tokens": max_tokens}
server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}"
# Async for streaming response
partial_text = ""
async with httpx.AsyncClient() as client:
async with client.stream("POST", f"{server_addr}/v1/chatqna", json=new_req, timeout=None) as response:
- partial_text = ""
- async for chunk in response.aiter_lines():
- new_text = chunk
- if new_text.startswith("data"):
- new_text = re.sub(r"\r\n", "", chunk.split("data: ")[-1])
- new_text = json.loads(chunk)["choices"][0]["message"]["content"]
- partial_text = partial_text + new_text
+ async for chunk in response.aiter_text():
+ partial_text = partial_text + chunk
history[-1][1] = partial_text
yield history
@@ -198,6 +126,7 @@ async def bot(
avail_rerank_models = get_local_available_models("rerank")
avail_devices = get_available_devices()
avail_weights_compression = get_available_weights()
+ avail_llm_inference_type = get_avail_llm_inference_type()
avail_node_parsers = pconf.get_available_node_parsers()
avail_indexers = pconf.get_available_indexers()
avail_retrievers = pconf.get_available_retrievers()
@@ -212,7 +141,7 @@ async def bot(
.disclaimer {font-variant-caps: all-small-caps}
"""
- with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
gr.HTML(
"""
@@ -250,7 +179,7 @@ async def bot(