diff --git a/AudioQnA/benchmark/accuracy/README.md b/AudioQnA/benchmark/accuracy/README.md
new file mode 100644
index 000000000..67119121a
--- /dev/null
+++ b/AudioQnA/benchmark/accuracy/README.md
@@ -0,0 +1,51 @@
+# AudioQnA Accuracy Evaluation
+
+AudioQnA is an example that demonstrates the integration of Generative AI (GenAI) models for performing question answering (QnA) on audio input. It includes Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) components. The following is the pipeline for evaluating the ASR accuracy.
+
+## Dataset
+
+We evaluate the ASR accuracy on the `test-clean` split of the LibriSpeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 audio recordings with their reference transcripts.
+
+## Metrics
+
+We evaluate the WER (Word Error Rate) metric of the ASR microservice.
+
+## Evaluation
+
+### Launch ASR microservice
+
+Launch the ASR microservice with the following commands. For more details, please refer to the [ASR documentation](https://github.com/opea-project/GenAIComps/tree/main/comps/asr).
+
+```bash
+git clone https://github.com/opea-project/GenAIComps
+cd GenAIComps
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+# to evaluate a different model, change the value passed to --model_name_or_path
+docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny"
+```
+
+### Evaluate
+
+Install dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+Evaluate the ASR accuracy:
+
+```bash
+# validate the offline model
+# python local_eval.py
+# validate the online asr microservice accuracy
+python online_eval.py
+```
+
+### Performance Result
+
+Here are the tested results for your reference:
+| Model | WER (%) |
+| --- | ---- |
+|whisper-large-v2| 2.87|
+|whisper-large| 2.7 |
+|whisper-medium| 3.45 |
diff --git a/AudioQnA/benchmark/accuracy/local_eval.py b/AudioQnA/benchmark/accuracy/local_eval.py
new file mode 100644
index 000000000..1ef7b6dfa
--- /dev/null
+++ b/AudioQnA/benchmark/accuracy/local_eval.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+from datasets import load_dataset
+from evaluate import load
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available
+
+MODEL_NAME = "openai/whisper-large-v2"  # checkpoint loaded below for both the processor and the model
+
+librispeech_test_clean = load_dataset(
+    "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True
+)
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
+
+
+def map_to_pred(batch):
+    """Transcribe one sample locally and store the normalized ``reference`` and ``prediction`` on it."""
+    normalize = processor.tokenizer._normalize
+    sample = batch["audio"]
+    features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+    batch["reference"] = normalize(batch["text"])
+    with torch.no_grad():  # inference only, no gradients needed
+        token_ids = model.generate(features.to(device))[0]
+    batch["prediction"] = normalize(processor.decode(token_ids))
+    return batch
+
+
+result = librispeech_test_clean.map(map_to_pred)  # adds "reference" and "prediction" to every sample
+
+wer = load("wer")
+print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))  # WER scaled by 100
diff --git a/AudioQnA/benchmark/accuracy/online_eval.py b/AudioQnA/benchmark/accuracy/online_eval.py
new file mode 100644
index 000000000..a7854c95b
--- /dev/null
+++ b/AudioQnA/benchmark/accuracy/online_eval.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import json
+
+import requests
+import torch
+from datasets import load_dataset
+from evaluate import load
+from pydub import AudioSegment
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+MODEL_NAME = "openai/whisper-large-v2"
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)  # map_to_pred uses only its tokenizer's normalizer
+
+librispeech_test_clean = load_dataset(
+    "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True
+)
+
+
+def map_to_pred(batch):
+    """Transcribe one LibriSpeech sample through the ASR microservice.
+
+    Reads ``batch["text"]`` and ``batch["file"]`` and adds the normalized ``reference`` and ``prediction``.
+    Raises ``requests.HTTPError`` when the service answers with an error status.
+    """
+    import io  # local import: leaves the module-level import list untouched
+
+    normalize = processor.tokenizer._normalize
+    batch["reference"] = normalize(batch["text"])
+
+    # batch["file"] is <dir>/<a>-<b>-<c>.<ext>; the audio is read from <dir>/LibriSpeech/test-clean/<a>/<b>/.
+    prefix, sep, file_name = batch["file"].rpartition("/")
+    utt_id, dot, ext = file_name.rpartition(".")
+    speaker_id, chapter_id = utt_id.split("-")[:2]
+    file_path = f"{prefix}{sep}LibriSpeech/test-clean/{speaker_id}/{chapter_id}/{utt_id}{dot}{ext}"
+
+    # Encode as real WAV in memory: export() defaults to format="mp3" (lossy), and a fixed "tmp.wav" on
+    # disk is never cleaned up and would be shared by concurrent workers or runs.
+    wav_buffer = io.BytesIO()
+    AudioSegment.from_file(file_path).export(wav_buffer, format="wav")
+    inputs = {"audio": base64.b64encode(wav_buffer.getvalue()).decode("utf-8")}
+
+    endpoint = "http://localhost:7066/v1/asr"
+    response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}, timeout=300)
+    response.raise_for_status()  # fail loudly instead of a KeyError/JSON error on an error response
+
+    batch["prediction"] = normalize(response.json()["asr_result"])
+    return batch
+
+
+result = librispeech_test_clean.map(map_to_pred)  # adds "reference" and "prediction" to every sample
+
+wer = load("wer")
+print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))  # WER scaled by 100
diff --git a/AudioQnA/benchmark/accuracy/requirements.txt b/AudioQnA/benchmark/accuracy/requirements.txt
new file mode 100644
index 000000000..c3f6c51a1
--- /dev/null
+++ b/AudioQnA/benchmark/accuracy/requirements.txt
@@ -0,0 +1,8 @@
+datasets
+evaluate
+jiwer
+librosa
+pydub
+soundfile
+torch
+transformers
diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/README.md b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
index 338771dd0..d08061284 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
@@ -108,7 +108,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
   -H 'Content-Type: application/json'
 
 # speecht5 service
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
index 28ec3f402..842227ee5 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -108,7 +108,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
   -H 'Content-Type: application/json'
 
 # speecht5 service
diff --git a/AudioQnA/tests/test_gmc_on_gaudi.sh b/AudioQnA/tests/test_gmc_on_gaudi.sh
index 898a91524..d90bd3624 100755
--- a/AudioQnA/tests/test_gmc_on_gaudi.sh
+++ b/AudioQnA/tests/test_gmc_on_gaudi.sh
@@ -34,7 +34,7 @@ function validate_audioqa() {
     export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
     echo "$CLIENT_POD"
     accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST  -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST  -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
     echo "$byte_str" > $LOG_PATH/curl_audioqa.log
     if [ -z "$byte_str" ]; then
 	echo "audioqa failed, please check the logs in ${LOG_PATH}!"
diff --git a/AudioQnA/tests/test_gmc_on_xeon.sh b/AudioQnA/tests/test_gmc_on_xeon.sh
index ed6adddd2..15e04e62c 100755
--- a/AudioQnA/tests/test_gmc_on_xeon.sh
+++ b/AudioQnA/tests/test_gmc_on_xeon.sh
@@ -34,7 +34,7 @@ function validate_audioqa() {
     export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
     echo "$CLIENT_POD"
     accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST  -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST  -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
     echo "$byte_str" > $LOG_PATH/curl_audioqa.log
     if [ -z "$byte_str" ]; then
         echo "audioqa failed, please check the logs in ${LOG_PATH}!"
diff --git a/ChatQnA/README.md b/ChatQnA/README.md
index fa7156ad0..4f56abf05 100644
--- a/ChatQnA/README.md
+++ b/ChatQnA/README.md
@@ -245,7 +245,9 @@ Refer to the [AI PC Guide](./docker_compose/intel/cpu/aipc/README.md) for instru
 
 Refer to the [Intel Technology enabling for Openshift readme](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/workloads/opea/chatqna/README.md) for instructions to deploy ChatQnA prototype on RHOCP with [Red Hat OpenShift AI (RHOAI)](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai).
 
-## Consume ChatQnA Service
+## Consume ChatQnA Service with RAG
+
+### Check Service Status
 
 Before consuming ChatQnA Service, make sure the TGI/vLLM service is ready (which takes up to 2 minutes to start).
 
@@ -260,6 +262,23 @@ Consume ChatQnA service until you get the TGI response like below.
 2024-09-03T02:47:53.402023Z  INFO text_generation_router::server: router/src/server.rs:2311: Connected
 ```
 
+### Upload RAG Files (Optional)
+
+To chat with retrieved information, you need to upload a file using the `Dataprep` service.
+
+Here is an example using the `Nike 2023` PDF file.
+
+```bash
+# download pdf file
+wget https://raw.githubusercontent.com/opea-project/GenAIComps/main/comps/retrievers/redis/data/nke-10k-2023.pdf
+# upload pdf file with dataprep
+curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./nke-10k-2023.pdf"
+```
+
+### Consume Chat Service
+
 Two ways of consuming ChatQnA Service:
 
 1. Use cURL command on terminal
diff --git a/ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml
deleted file mode 100644
index c447bcec2..000000000
--- a/ChatQnA/benchmark/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml
+++ /dev/null
@@ -1,641 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  INDEX_NAME: rag-redis
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  LLM_SERVICE_HOST_IP: llm-svc
-  NODE_SELECTOR: chatqna-opea
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        ports:
-        - containerPort: 8888
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    nodePort: 30888
-    port: 8888
-    targetPort: 8888
-  selector:
-    app: chatqna-backend-server-deploy
-  type: NodePort
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        ports:
-        - containerPort: 6007
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  selector:
-    app: dataprep-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        imagePullPolicy: IfNotPresent
-        name: embedding-dependency-deploy
-        ports:
-        - containerPort: 80
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-  selector:
-    app: embedding-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        ports:
-        - containerPort: 6000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-  selector:
-    app: embedding-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 31
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '2048'
-        - --max-total-tokens
-        - '4096'
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        imagePullPolicy: IfNotPresent
-        name: llm-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-  selector:
-    app: llm-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        ports:
-        - containerPort: 9000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-  selector:
-    app: llm-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-  selector:
-    app: reranking-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        ports:
-        - containerPort: 8000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-  selector:
-    app: reranking-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        ports:
-        - containerPort: 7000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: retriever-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-  selector:
-    app: retriever-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: vector-db
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: redis/redis-stack:7.2.0-v9
-        imagePullPolicy: IfNotPresent
-        name: vector-db
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: vector-db
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-  selector:
-    app: vector-db
-  type: ClusterIP
----
diff --git a/ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml
deleted file mode 100644
index 859568ef9..000000000
--- a/ChatQnA/benchmark/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml
+++ /dev/null
@@ -1,641 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  INDEX_NAME: rag-redis
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  LLM_SERVICE_HOST_IP: llm-svc
-  NODE_SELECTOR: chatqna-opea
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        ports:
-        - containerPort: 8888
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    nodePort: 30888
-    port: 8888
-    targetPort: 8888
-  selector:
-    app: chatqna-backend-server-deploy
-  type: NodePort
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        ports:
-        - containerPort: 6007
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  selector:
-    app: dataprep-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        imagePullPolicy: IfNotPresent
-        name: embedding-dependency-deploy
-        ports:
-        - containerPort: 80
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-  selector:
-    app: embedding-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        ports:
-        - containerPort: 6000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-  selector:
-    app: embedding-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 7
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '2048'
-        - --max-total-tokens
-        - '4096'
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        imagePullPolicy: IfNotPresent
-        name: llm-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-  selector:
-    app: llm-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        ports:
-        - containerPort: 9000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-  selector:
-    app: llm-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-  selector:
-    app: reranking-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        ports:
-        - containerPort: 8000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-  selector:
-    app: reranking-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        ports:
-        - containerPort: 7000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: retriever-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-  selector:
-    app: retriever-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: vector-db
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: redis/redis-stack:7.2.0-v9
-        imagePullPolicy: IfNotPresent
-        name: vector-db
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: vector-db
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-  selector:
-    app: vector-db
-  type: ClusterIP
----
diff --git a/ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml
deleted file mode 100644
index b64263be1..000000000
--- a/ChatQnA/benchmark/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml
+++ /dev/null
@@ -1,641 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  INDEX_NAME: rag-redis
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  LLM_SERVICE_HOST_IP: llm-svc
-  NODE_SELECTOR: chatqna-opea
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        ports:
-        - containerPort: 8888
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    nodePort: 30888
-    port: 8888
-    targetPort: 8888
-  selector:
-    app: chatqna-backend-server-deploy
-  type: NodePort
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        ports:
-        - containerPort: 6007
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  selector:
-    app: dataprep-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        imagePullPolicy: IfNotPresent
-        name: embedding-dependency-deploy
-        ports:
-        - containerPort: 80
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-  selector:
-    app: embedding-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        ports:
-        - containerPort: 6000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-  selector:
-    app: embedding-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 15
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '2048'
-        - --max-total-tokens
-        - '4096'
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        imagePullPolicy: IfNotPresent
-        name: llm-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-  selector:
-    app: llm-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        ports:
-        - containerPort: 9000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-  selector:
-    app: llm-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-  selector:
-    app: reranking-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        ports:
-        - containerPort: 8000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-  selector:
-    app: reranking-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        ports:
-        - containerPort: 7000
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: retriever-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-  selector:
-    app: retriever-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: vector-db
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: redis/redis-stack:7.2.0-v9
-        imagePullPolicy: IfNotPresent
-        name: vector-db
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: vector-db
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-  selector:
-    app: vector-db
-  type: ClusterIP
----
diff --git a/ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml
deleted file mode 100644
index 6869a78f1..000000000
--- a/ChatQnA/benchmark/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml
+++ /dev/null
@@ -1,730 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  INDEX_NAME: rag-redis
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  NODE_SELECTOR: chatqna-opea
-  LLM_SERVICE_HOST_IP: llm-svc
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna-without-rerank:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        args: null
-        ports:
-        - containerPort: 8888
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  type: NodePort
-  selector:
-    app: chatqna-backend-server-deploy
-  ports:
-  - name: service
-    port: 8888
-    targetPort: 8888
-    nodePort: 30888
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        args: null
-        ports:
-        - containerPort: 6007
-        - containerPort: 6008
-        - containerPort: 6009
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: dataprep-deploy
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  - name: port2
-    port: 6008
-    targetPort: 6008
-  - name: port3
-    port: 6009
-    targetPort: 6009
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        name: embedding-dependency-deploy
-        args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-dependency-deploy
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: embedding-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        args: null
-        ports:
-        - containerPort: 6000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-deploy
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 32
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.1
-        name: llm-dependency-deploy-demo
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '2048'
-        - --max-total-tokens
-        - '4096'
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-dependency-deploy
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: llm-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        args: null
-        ports:
-        - containerPort: 9000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-deploy
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        name: reranking-dependency-deploy
-        args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: reranking-dependency-deploy
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: reranking-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        args: null
-        ports:
-        - containerPort: 8000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: reranking-deploy
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: retriever-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_EMBEDDING_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: HUGGINGFACEHUB_API_TOKEN
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: HUGGINGFACEHUB_API_TOKEN
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        args: null
-        ports:
-        - containerPort: 7000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: retriever-deploy
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      labels:
-        app: vector-db
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: vector-db
-      containers:
-      - name: vector-db
-        image: redis/redis-stack:7.2.0-v9
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: vector-db
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-
-
----
diff --git a/ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml
deleted file mode 100644
index f38efbeb6..000000000
--- a/ChatQnA/benchmark/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml
+++ /dev/null
@@ -1,579 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  INDEX_NAME: rag-redis
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  NODE_SELECTOR: chatqna-opea
-  LLM_SERVICE_HOST_IP: llm-svc
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna-without-rerank:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        args: null
-        ports:
-        - containerPort: 8888
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  type: NodePort
-  selector:
-    app: chatqna-backend-server-deploy
-  ports:
-  - name: service
-    port: 8888
-    targetPort: 8888
-    nodePort: 30888
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        args: null
-        ports:
-        - containerPort: 6007
-        - containerPort: 6008
-        - containerPort: 6009
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: dataprep-deploy
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  - name: port2
-    port: 6008
-    targetPort: 6008
-  - name: port3
-    port: 6009
-    targetPort: 6009
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        name: embedding-dependency-deploy
-        args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-dependency-deploy
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: embedding-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        args: null
-        ports:
-        - containerPort: 6000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-deploy
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 8
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.1
-        name: llm-dependency-deploy-demo
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '2048'
-        - --max-total-tokens
-        - '4096'
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-dependency-deploy
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: llm-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        args: null
-        ports:
-        - containerPort: 9000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-deploy
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: retriever-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_EMBEDDING_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: HUGGINGFACEHUB_API_TOKEN
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: HUGGINGFACEHUB_API_TOKEN
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        args: null
-        ports:
-        - containerPort: 7000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: retriever-deploy
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      labels:
-        app: vector-db
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: vector-db
-      containers:
-      - name: vector-db
-        image: redis/redis-stack:7.2.0-v9
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: vector-db
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-
-
----
diff --git a/ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml
deleted file mode 100644
index a43553dda..000000000
--- a/ChatQnA/benchmark/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml
+++ /dev/null
@@ -1,579 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  INDEX_NAME: rag-redis
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  NODE_SELECTOR: chatqna-opea
-  LLM_SERVICE_HOST_IP: llm-svc
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna-without-rerank:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        args: null
-        ports:
-        - containerPort: 8888
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  type: NodePort
-  selector:
-    app: chatqna-backend-server-deploy
-  ports:
-  - name: service
-    port: 8888
-    targetPort: 8888
-    nodePort: 30888
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        args: null
-        ports:
-        - containerPort: 6007
-        - containerPort: 6008
-        - containerPort: 6009
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: dataprep-deploy
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  - name: port2
-    port: 6008
-    targetPort: 6008
-  - name: port3
-    port: 6009
-    targetPort: 6009
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        name: embedding-dependency-deploy
-        args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-dependency-deploy
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: embedding-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        args: null
-        ports:
-        - containerPort: 6000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-deploy
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 16
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.1
-        name: llm-dependency-deploy-demo
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '2048'
-        - --max-total-tokens
-        - '4096'
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-dependency-deploy
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: llm-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        args: null
-        ports:
-        - containerPort: 9000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-deploy
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: retriever-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_EMBEDDING_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: HUGGINGFACEHUB_API_TOKEN
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: HUGGINGFACEHUB_API_TOKEN
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        args: null
-        ports:
-        - containerPort: 7000
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: retriever-deploy
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      labels:
-        app: vector-db
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: vector-db
-      containers:
-      - name: vector-db
-        image: redis/redis-stack:7.2.0-v9
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: vector-db
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-
-
----
diff --git a/ChatQnA/benchmark/README.md b/ChatQnA/benchmark/performance/README.md
similarity index 95%
rename from ChatQnA/benchmark/README.md
rename to ChatQnA/benchmark/performance/README.md
index 68347a02d..9ca756028 100644
--- a/ChatQnA/benchmark/README.md
+++ b/ChatQnA/benchmark/performance/README.md
@@ -67,7 +67,7 @@ We have created the [BKC manifest](https://github.com/opea-project/GenAIExamples
 ```bash
 # on k8s-master node
 git clone https://github.com/opea-project/GenAIExamples.git
-cd GenAIExamples/ChatQnA/benchmark
+cd GenAIExamples/ChatQnA/benchmark/performance
 
 # replace the image tag from latest to v0.9 since we want to test with v0.9 release
 IMAGE_TAG=v0.9
@@ -148,7 +148,7 @@ Go to [BKC manifest](https://github.com/opea-project/GenAIExamples/tree/main/Cha
 
 ```bash
 # on k8s-master node
-cd GenAIExamples/ChatQnA/benchmark/tuned/with_rerank/single_gaudi
+cd GenAIExamples/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi
 kubectl apply -f .
 ```
 
@@ -210,7 +210,7 @@ All the test results will come to this folder `/home/sdp/benchmark_output/node_1
 
 ```bash
 # on k8s-master node
-cd GenAIExamples/ChatQnA/benchmark/tuned/with_rerank/single_gaudi
+cd GenAIExamples/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi
 kubectl delete -f .
 kubectl label nodes k8s-worker1 node-type-
 ```
@@ -231,7 +231,7 @@ Go to [BKC manifest](https://github.com/opea-project/GenAIExamples/tree/main/Cha
 
 ```bash
 # on k8s-master node
-cd GenAIExamples/ChatQnA/benchmark/tuned/with_rerank/two_gaudi
+cd GenAIExamples/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi
 kubectl apply -f .
 ```
 
@@ -280,7 +280,7 @@ Go to [BKC manifest](https://github.com/opea-project/GenAIExamples/tree/main/Cha
 
 ```bash
 # on k8s-master node
-cd GenAIExamples/ChatQnA/benchmark/tuned/with_rerank/four_gaudi
+cd GenAIExamples/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi
 kubectl apply -f .
 ```
 
@@ -309,7 +309,7 @@ All the test results will come to this folder `/home/sdp/benchmark_output/node_4
 
 ```bash
 # on k8s-master node
-cd GenAIExamples/ChatQnA/benchmark/tuned/with_rerank/single_gaudi
+cd GenAIExamples/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi
 kubectl delete -f .
 kubectl label nodes k8s-master k8s-worker1 k8s-worker2 k8s-worker3 node-type-
 ```
diff --git a/ChatQnA/benchmark/benchmark.yaml b/ChatQnA/benchmark/performance/benchmark.yaml
similarity index 98%
rename from ChatQnA/benchmark/benchmark.yaml
rename to ChatQnA/benchmark/performance/benchmark.yaml
index f1eb86a37..851a3e11a 100644
--- a/ChatQnA/benchmark/benchmark.yaml
+++ b/ChatQnA/benchmark/performance/benchmark.yaml
@@ -41,7 +41,7 @@ test_cases:
       run_test: false
       service_name: "llm-svc"  # Replace with your service name
       parameters:
-        max_new_tokens: 128
+        max_tokens: 128
         temperature: 0.01
         top_k: 10
         top_p: 0.95
diff --git a/ChatQnA/benchmark/oob_no_wrapper/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/without_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/without_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/without_rerank/four_gaudi/no_wrapper_oob_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/no_wrapper_oob_four_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/without_rerank/four_gaudi/no_wrapper_oob_four_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/no_wrapper_oob_four_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/without_rerank/single_gaudi/no_wrapper_oob_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/no_wrapper_oob_single_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/without_rerank/single_gaudi/no_wrapper_oob_single_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/no_wrapper_oob_single_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/oob_no_wrapper/without_rerank/two_gaudi/no_wrapper_oob_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/no_wrapper_oob_two_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/oob_no_wrapper/without_rerank/two_gaudi/no_wrapper_oob_two_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/no_wrapper_oob_two_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/without_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/without_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/without_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/without_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/without_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/without_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned_no_wrapper/without_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_without_rerank.yaml
similarity index 100%
rename from ChatQnA/benchmark/tuned_no_wrapper/without_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_without_rerank.yaml
diff --git a/ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml
deleted file mode 100644
index 1158bada9..000000000
--- a/ChatQnA/benchmark/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml
+++ /dev/null
@@ -1,675 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  INDEX_NAME: rag-redis
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  LLM_SERVICE_HOST_IP: llm-svc
-  NODE_SELECTOR: chatqna-opea
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        ports:
-        - containerPort: 8888
-        resources:
-          limits:
-            cpu: 8
-            memory: 8000Mi
-          requests:
-            cpu: 8
-            memory: 8000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    nodePort: 30888
-    port: 8888
-    targetPort: 8888
-  selector:
-    app: chatqna-backend-server-deploy
-  type: NodePort
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        ports:
-        - containerPort: 6007
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  selector:
-    app: dataprep-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        imagePullPolicy: IfNotPresent
-        name: embedding-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            cpu: 76
-            memory: 20000Mi
-          requests:
-            cpu: 76
-            memory: 20000Mi
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-  selector:
-    app: embedding-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        ports:
-        - containerPort: 6000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-  selector:
-    app: embedding-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 31
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '1024'
-        - --max-total-tokens
-        - '2048'
-        - --max-batch-total-tokens
-        - '65536'
-        - --max-batch-prefill-tokens
-        - '4096'
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        imagePullPolicy: IfNotPresent
-        name: llm-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-  selector:
-    app: llm-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        ports:
-        - containerPort: 9000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-  selector:
-    app: llm-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-  selector:
-    app: reranking-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        ports:
-        - containerPort: 8000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-  selector:
-    app: reranking-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        ports:
-        - containerPort: 7000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: retriever-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-  selector:
-    app: retriever-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: vector-db
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: redis/redis-stack:7.2.0-v9
-        imagePullPolicy: IfNotPresent
-        name: vector-db
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: vector-db
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-  selector:
-    app: vector-db
-  type: ClusterIP
----
diff --git a/ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml
deleted file mode 100644
index e40977213..000000000
--- a/ChatQnA/benchmark/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml
+++ /dev/null
@@ -1,675 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  INDEX_NAME: rag-redis
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  LLM_SERVICE_HOST_IP: llm-svc
-  NODE_SELECTOR: chatqna-opea
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        ports:
-        - containerPort: 8888
-        resources:
-          limits:
-            cpu: 8
-            memory: 8000Mi
-          requests:
-            cpu: 8
-            memory: 8000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    nodePort: 30888
-    port: 8888
-    targetPort: 8888
-  selector:
-    app: chatqna-backend-server-deploy
-  type: NodePort
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        ports:
-        - containerPort: 6007
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  selector:
-    app: dataprep-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        imagePullPolicy: IfNotPresent
-        name: embedding-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            cpu: 76
-            memory: 20000Mi
-          requests:
-            cpu: 76
-            memory: 20000Mi
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-  selector:
-    app: embedding-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        ports:
-        - containerPort: 6000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-  selector:
-    app: embedding-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 7
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '1024'
-        - --max-total-tokens
-        - '2048'
-        - --max-batch-total-tokens
-        - '65536'
-        - --max-batch-prefill-tokens
-        - '4096'
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        imagePullPolicy: IfNotPresent
-        name: llm-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-  selector:
-    app: llm-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        ports:
-        - containerPort: 9000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-  selector:
-    app: llm-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-  selector:
-    app: reranking-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        ports:
-        - containerPort: 8000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-  selector:
-    app: reranking-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        ports:
-        - containerPort: 7000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: retriever-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-  selector:
-    app: retriever-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: vector-db
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: redis/redis-stack:7.2.0-v9
-        imagePullPolicy: IfNotPresent
-        name: vector-db
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: vector-db
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-  selector:
-    app: vector-db
-  type: ClusterIP
----
diff --git a/ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml
deleted file mode 100644
index 2a54e1ca6..000000000
--- a/ChatQnA/benchmark/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml
+++ /dev/null
@@ -1,675 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  INDEX_NAME: rag-redis
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  LLM_SERVICE_HOST_IP: llm-svc
-  NODE_SELECTOR: chatqna-opea
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        ports:
-        - containerPort: 8888
-        resources:
-          limits:
-            cpu: 8
-            memory: 8000Mi
-          requests:
-            cpu: 8
-            memory: 8000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    nodePort: 30888
-    port: 8888
-    targetPort: 8888
-  selector:
-    app: chatqna-backend-server-deploy
-  type: NodePort
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        ports:
-        - containerPort: 6007
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  selector:
-    app: dataprep-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        imagePullPolicy: IfNotPresent
-        name: embedding-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            cpu: 76
-            memory: 20000Mi
-          requests:
-            cpu: 76
-            memory: 20000Mi
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-  selector:
-    app: embedding-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        ports:
-        - containerPort: 6000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: embedding-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-  selector:
-    app: embedding-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 15
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '1024'
-        - --max-total-tokens
-        - '2048'
-        - --max-batch-total-tokens
-        - '65536'
-        - --max-batch-prefill-tokens
-        - '4096'
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        imagePullPolicy: IfNotPresent
-        name: llm-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-  selector:
-    app: llm-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        ports:
-        - containerPort: 9000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: llm-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-  selector:
-    app: llm-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: reranking-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-dependency-deploy
-    spec:
-      containers:
-      - args:
-        - --model-id
-        - $(RERANK_MODEL_ID)
-        - --auto-truncate
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-        - name: MAX_WARMUP_SEQUENCE_LENGTH
-          value: '512'
-        envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/tei-gaudi:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-dependency-deploy
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-dependency-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - hostPath:
-          path: /mnt/models
-          type: Directory
-        name: model-volume
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
-        name: shm
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-dependency-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8808
-    targetPort: 80
-  selector:
-    app: reranking-dependency-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: reranking-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: reranking-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: reranking-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/reranking-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: reranking-deploy
-        ports:
-        - containerPort: 8000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: reranking-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: reranking-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 8000
-    targetPort: 8000
-  selector:
-    app: reranking-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        ports:
-        - containerPort: 7000
-        resources:
-          requests:
-            cpu: 4
-            memory: 4000Mi
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: retriever-deploy
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-  selector:
-    app: retriever-deploy
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: vector-db
-    spec:
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: redis/redis-stack:7.2.0-v9
-        imagePullPolicy: IfNotPresent
-        name: vector-db
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
-      hostIPC: true
-      nodeSelector:
-        node-type: chatqna-opea
-      serviceAccountName: default
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            app: vector-db
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-  selector:
-    app: vector-db
-  type: ClusterIP
----
diff --git a/ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml
deleted file mode 100644
index ad0d8ec55..000000000
--- a/ChatQnA/benchmark/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml
+++ /dev/null
@@ -1,614 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  INDEX_NAME: rag-redis
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  NODE_SELECTOR: chatqna-opea
-  LLM_SERVICE_HOST_IP: llm-svc
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna-without-rerank:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        args: null
-        ports:
-        - containerPort: 8888
-        resources:
-          limits:
-            cpu: 8
-            memory: 4000Mi
-          requests:
-            cpu: 8
-            memory: 4000Mi
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  type: NodePort
-  selector:
-    app: chatqna-backend-server-deploy
-  ports:
-  - name: service
-    port: 8888
-    targetPort: 8888
-    nodePort: 30888
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        args: null
-        ports:
-        - containerPort: 6007
-        - containerPort: 6008
-        - containerPort: 6009
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: dataprep-deploy
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  - name: port2
-    port: 6008
-    targetPort: 6008
-  - name: port3
-    port: 6009
-    targetPort: 6009
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        name: embedding-dependency-deploy
-        args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            cpu: 76
-            memory: 20000Mi
-          requests:
-            cpu: 76
-            memory: 20000Mi
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-dependency-deploy
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: embedding-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        args: null
-        ports:
-        - containerPort: 6000
-        resources:
-          limits:
-            cpu: 4
-          requests:
-            cpu: 4
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-deploy
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 32
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        name: llm-dependency-deploy-demo
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '1024'
-        - --max-total-tokens
-        - '2048'
-        - --max-batch-total-tokens
-        - '65536'
-        - --max-batch-prefill-tokens
-        - '4096'
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-dependency-deploy
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: llm-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        args: null
-        ports:
-        - containerPort: 9000
-        resources:
-          limits:
-            cpu: 4
-          requests:
-            cpu: 4
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-deploy
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: retriever-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_EMBEDDING_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: HUGGINGFACEHUB_API_TOKEN
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: HUGGINGFACEHUB_API_TOKEN
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        args: null
-        ports:
-        - containerPort: 7000
-        resources:
-          limits:
-            cpu: 8
-            memory: 2500Mi
-          requests:
-            cpu: 8
-            memory: 2500Mi
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: retriever-deploy
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      labels:
-        app: vector-db
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: vector-db
-      containers:
-      - name: vector-db
-        image: redis/redis-stack:7.2.0-v9
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: vector-db
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-
-
----
diff --git a/ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml
deleted file mode 100644
index 0a2bdd525..000000000
--- a/ChatQnA/benchmark/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml
+++ /dev/null
@@ -1,614 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  INDEX_NAME: rag-redis
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  NODE_SELECTOR: chatqna-opea
-  LLM_SERVICE_HOST_IP: llm-svc
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna-without-rerank:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        args: null
-        ports:
-        - containerPort: 8888
-        resources:
-          limits:
-            cpu: 8
-            memory: 4000Mi
-          requests:
-            cpu: 8
-            memory: 4000Mi
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  type: NodePort
-  selector:
-    app: chatqna-backend-server-deploy
-  ports:
-  - name: service
-    port: 8888
-    targetPort: 8888
-    nodePort: 30888
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        args: null
-        ports:
-        - containerPort: 6007
-        - containerPort: 6008
-        - containerPort: 6009
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: dataprep-deploy
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  - name: port2
-    port: 6008
-    targetPort: 6008
-  - name: port3
-    port: 6009
-    targetPort: 6009
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        name: embedding-dependency-deploy
-        args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            cpu: 76
-            memory: 20000Mi
-          requests:
-            cpu: 76
-            memory: 20000Mi
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-dependency-deploy
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: embedding-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        args: null
-        ports:
-        - containerPort: 6000
-        resources:
-          limits:
-            cpu: 4
-          requests:
-            cpu: 4
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-deploy
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 8
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        name: llm-dependency-deploy-demo
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '1024'
-        - --max-total-tokens
-        - '2048'
-        - --max-batch-total-tokens
-        - '65536'
-        - --max-batch-prefill-tokens
-        - '4096'
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-dependency-deploy
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: llm-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        args: null
-        ports:
-        - containerPort: 9000
-        resources:
-          limits:
-            cpu: 4
-          requests:
-            cpu: 4
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-deploy
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: retriever-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_EMBEDDING_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: HUGGINGFACEHUB_API_TOKEN
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: HUGGINGFACEHUB_API_TOKEN
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        args: null
-        ports:
-        - containerPort: 7000
-        resources:
-          limits:
-            cpu: 8
-            memory: 2500Mi
-          requests:
-            cpu: 8
-            memory: 2500Mi
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: retriever-deploy
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      labels:
-        app: vector-db
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: vector-db
-      containers:
-      - name: vector-db
-        image: redis/redis-stack:7.2.0-v9
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: vector-db
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-
-
----
diff --git a/ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
deleted file mode 100644
index 9a4554d9f..000000000
--- a/ChatQnA/benchmark/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
+++ /dev/null
@@ -1,614 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: qna-config
-  namespace: default
-data:
-  EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5
-  RERANK_MODEL_ID: BAAI/bge-reranker-base
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006
-  TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808
-  TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009
-  REDIS_URL: redis://vector-db.default.svc.cluster.local:6379
-  INDEX_NAME: rag-redis
-  HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
-  EMBEDDING_SERVICE_HOST_IP: embedding-svc
-  RETRIEVER_SERVICE_HOST_IP: retriever-svc
-  RERANK_SERVICE_HOST_IP: reranking-svc
-  NODE_SELECTOR: chatqna-opea
-  LLM_SERVICE_HOST_IP: llm-svc
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: chatqna-backend-server-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: chatqna-backend-server-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: chatqna-backend-server-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: chatqna-backend-server-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/chatqna-without-rerank:latest
-        imagePullPolicy: IfNotPresent
-        name: chatqna-backend-server-deploy
-        args: null
-        ports:
-        - containerPort: 8888
-        resources:
-          limits:
-            cpu: 8
-            memory: 4000Mi
-          requests:
-            cpu: 8
-            memory: 4000Mi
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: chatqna-backend-server-svc
-  namespace: default
-spec:
-  type: NodePort
-  selector:
-    app: chatqna-backend-server-deploy
-  ports:
-  - name: service
-    port: 8888
-    targetPort: 8888
-    nodePort: 30888
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: dataprep-deploy
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: dataprep-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: dataprep-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: dataprep-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/dataprep-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: dataprep-deploy
-        args: null
-        ports:
-        - containerPort: 6007
-        - containerPort: 6008
-        - containerPort: 6009
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: dataprep-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: dataprep-deploy
-  ports:
-  - name: port1
-    port: 6007
-    targetPort: 6007
-  - name: port2
-    port: 6008
-    targetPort: 6008
-  - name: port3
-    port: 6009
-    targetPort: 6009
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-dependency-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: embedding-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-        name: embedding-dependency-deploy
-        args:
-        - --model-id
-        - $(EMBEDDING_MODEL_ID)
-        - --auto-truncate
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            cpu: 76
-            memory: 20000Mi
-          requests:
-            cpu: 76
-            memory: 20000Mi
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-dependency-deploy
-  ports:
-  - name: service
-    port: 6006
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: embedding-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: embedding-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: embedding-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: embedding-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/embedding-tei:latest
-        imagePullPolicy: IfNotPresent
-        name: embedding-deploy
-        args: null
-        ports:
-        - containerPort: 6000
-        resources:
-          limits:
-            cpu: 4
-          requests:
-            cpu: 4
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: embedding-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: embedding-deploy
-  ports:
-  - name: service
-    port: 6000
-    targetPort: 6000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-dependency-deploy
-  namespace: default
-spec:
-  replicas: 16
-  selector:
-    matchLabels:
-      app: llm-dependency-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-dependency-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: ghcr.io/huggingface/tgi-gaudi:2.0.4
-        name: llm-dependency-deploy-demo
-        securityContext:
-          capabilities:
-            add:
-            - SYS_NICE
-        args:
-        - --model-id
-        - $(LLM_MODEL_ID)
-        - --max-input-length
-        - '1024'
-        - --max-total-tokens
-        - '2048'
-        - --max-batch-total-tokens
-        - '65536'
-        - --max-batch-prefill-tokens
-        - '4096'
-        volumeMounts:
-        - mountPath: /data
-          name: model-volume
-        - mountPath: /dev/shm
-          name: shm
-        ports:
-        - containerPort: 80
-        resources:
-          limits:
-            habana.ai/gaudi: 1
-        env:
-        - name: OMPI_MCA_btl_vader_single_copy_mechanism
-          value: none
-        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
-          value: 'true'
-        - name: runtime
-          value: habana
-        - name: HABANA_VISIBLE_DEVICES
-          value: all
-        - name: HF_TOKEN
-          value: ${HF_TOKEN}
-      serviceAccountName: default
-      volumes:
-      - name: model-volume
-        hostPath:
-          path: /mnt/models
-          type: Directory
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: 1Gi
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-dependency-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-dependency-deploy
-  ports:
-  - name: service
-    port: 9009
-    targetPort: 80
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: llm-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: llm-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: llm-deploy
-      hostIPC: true
-      containers:
-      - envFrom:
-        - configMapRef:
-            name: qna-config
-        image: opea/llm-tgi:latest
-        imagePullPolicy: IfNotPresent
-        name: llm-deploy
-        args: null
-        ports:
-        - containerPort: 9000
-        resources:
-          limits:
-            cpu: 4
-          requests:
-            cpu: 4
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: llm-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-deploy
-  ports:
-  - name: service
-    port: 9000
-    targetPort: 9000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: retriever-deploy
-  namespace: default
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: retriever-deploy
-  template:
-    metadata:
-      annotations:
-        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
-      labels:
-        app: retriever-deploy
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: retriever-deploy
-      hostIPC: true
-      containers:
-      - env:
-        - name: REDIS_URL
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: REDIS_URL
-        - name: TEI_EMBEDDING_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: TEI_EMBEDDING_ENDPOINT
-        - name: HUGGINGFACEHUB_API_TOKEN
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: HUGGINGFACEHUB_API_TOKEN
-        - name: INDEX_NAME
-          valueFrom:
-            configMapKeyRef:
-              name: qna-config
-              key: INDEX_NAME
-        image: opea/retriever-redis:latest
-        imagePullPolicy: IfNotPresent
-        name: retriever-deploy
-        args: null
-        ports:
-        - containerPort: 7000
-        resources:
-          limits:
-            cpu: 8
-            memory: 2500Mi
-          requests:
-            cpu: 8
-            memory: 2500Mi
-      serviceAccountName: default
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: retriever-svc
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: retriever-deploy
-  ports:
-  - name: service
-    port: 7000
-    targetPort: 7000
-
-
----
-
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vector-db
-  template:
-    metadata:
-      labels:
-        app: vector-db
-    spec:
-      nodeSelector:
-        node-type: chatqna-opea
-      topologySpreadConstraints:
-      - maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-        labelSelector:
-          matchLabels:
-            app: vector-db
-      containers:
-      - name: vector-db
-        image: redis/redis-stack:7.2.0-v9
-        ports:
-        - containerPort: 6379
-        - containerPort: 8001
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vector-db
-  namespace: default
-spec:
-  type: ClusterIP
-  selector:
-    app: vector-db
-  ports:
-  - name: vector-db-service
-    port: 6379
-    targetPort: 6379
-  - name: vector-db-insight
-    port: 8001
-    targetPort: 8001
-
-
----
diff --git a/ChatQnA/chatqna_no_wrapper.py b/ChatQnA/chatqna_no_wrapper.py
index 2780c7486..c08c6a2f3 100644
--- a/ChatQnA/chatqna_no_wrapper.py
+++ b/ChatQnA/chatqna_no_wrapper.py
@@ -69,10 +69,12 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs = {}
         next_inputs["model"] = "tgi"  # specifically clarify the fake model to make the format unified
         next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
-        next_inputs["max_tokens"] = llm_parameters_dict["max_new_tokens"]
+        next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
         next_inputs["stream"] = inputs["streaming"]
-        next_inputs["frequency_penalty"] = inputs["repetition_penalty"]
+        next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
+        next_inputs["presence_penalty"] = inputs["presence_penalty"]
+        next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
         next_inputs["temperature"] = inputs["temperature"]
         inputs = next_inputs
 
diff --git a/ChatQnA/docker_compose/intel/cpu/aipc/README.md b/ChatQnA/docker_compose/intel/cpu/aipc/README.md
index 3c28d1c10..9b13d8185 100644
--- a/ChatQnA/docker_compose/intel/cpu/aipc/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/aipc/README.md
@@ -229,7 +229,7 @@ OLLAMA_HOST=${host_ip}:11434 ollama run $OLLAMA_MODEL
    ```bash
    curl http://${host_ip}:9000/v1/chat/completions\
      -X POST \
-     -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index 7eb75431a..5eca0d284 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -438,18 +438,31 @@ docker compose -f compose_vllm.yaml up -d
    This service depends on above LLM backend service startup. It will be ready after long time, to wait for them being ready in first startup.
 
    ```bash
+   # TGI service
    curl http://${host_ip}:9000/v1/chat/completions\
      -X POST \
-     -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
+   For parameters in TGI mode, please refer to the [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (except that "max_new_tokens" is renamed to "max_tokens").
+
+   ```bash
+   # vLLM Service
+   curl http://${host_ip}:9000/v1/chat/completions \
+    -X POST \
+    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
+    -H 'Content-Type: application/json'
+   ```
+
+   For parameters in vLLM mode, please refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
+
 8. MegaService
 
    ```bash
-   curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
-        "messages": "What is the revenue of Nike in 2023?"
-        }'
+    curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
+          "messages": "What is the revenue of Nike in 2023?"
+          }'
    ```
 
 9. Dataprep Microservice（Optional）
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
index 25ba15c3f..c11ab8e9f 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
@@ -304,7 +304,7 @@ docker compose -f compose_qdrant.yaml up -d
    ```bash
    curl http://${host_ip}:6047/v1/chat/completions\
      -X POST \
-     -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
index bc41c782a..ec8e3ad09 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -442,18 +442,41 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid
 7. LLM Microservice
 
    ```bash
+   # TGI service
+   curl http://${host_ip}:9000/v1/chat/completions\
+     -X POST \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -H 'Content-Type: application/json'
+   ```
+
+   For parameters in TGI mode, please refer to [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (except we rename "max_new_tokens" to "max_tokens".)
+
+   ```bash
+   # vLLM Service
    curl http://${host_ip}:9000/v1/chat/completions \
+    -X POST \
+    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
+    -H 'Content-Type: application/json'
+   ```
+
+   For parameters in vLLM mode, please refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
+
+   ```bash
+   # vLLM-on-Ray Service
+   curl http://${host_ip}:9000/v1/chat/completions \
      -X POST \
-     -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"presence_penalty":1.03,"streaming":false}' \
      -H 'Content-Type: application/json'
    ```
 
+   For parameters in vLLM-on-Ray mode, please refer to the [LangChain ChatOpenAI API](https://python.langchain.com/v0.2/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html).
+
 8. MegaService
 
    ```bash
    curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
-        "messages": "What is the revenue of Nike in 2023?"
-        }'
+         "messages": "What is the revenue of Nike in 2023?"
+         }'
    ```
 
 9. Dataprep Microservice（Optional）
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
index 2e2d3d023..8ada1e525 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
@@ -278,7 +278,7 @@ and the log shows model warm up, please wait for a while and try it later.
 ```
 curl http://${host_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 ```
 
diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md
index cfdda158f..7e3966a7f 100644
--- a/ChatQnA/docker_compose/nvidia/gpu/README.md
+++ b/ChatQnA/docker_compose/nvidia/gpu/README.md
@@ -280,7 +280,7 @@ docker compose up -d
    ```bash
    curl http://${host_ip}:9000/v1/chat/completions \
      -X POST \
-     -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/CodeGen/README.md b/CodeGen/README.md
index bc93ff473..fcf0f3e33 100644
--- a/CodeGen/README.md
+++ b/CodeGen/README.md
@@ -43,6 +43,8 @@ By default, the LLM model is set to a default value as listed below:
 [meta-llama/CodeLlama-7b-hf](https://huggingface.co/meta-llama/CodeLlama-7b-hf) is a gated model that requires submitting an access request through Hugging Face. You can replace it with another model.
 Change the `LLM_MODEL_ID` below for your needs, such as: [Qwen/CodeQwen1.5-7B-Chat](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat), [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)
 
+If you choose to use `meta-llama/CodeLlama-7b-hf` as the LLM model, visit its [model page](https://huggingface.co/meta-llama/CodeLlama-7b-hf) and click the `Expand to review and access` button to request access to the model.
+
 ### Setup Environment Variable
 
 To set up environment variables for deploying ChatQnA services, follow these steps:
@@ -132,10 +134,13 @@ Two ways of consuming CodeGen Service:
    http_proxy=""
    curl http://${host_ip}:8028/generate \
      -X POST \
-     -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' \
+     -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' \
      -H 'Content-Type: application/json'
    ```
 
-2. (Docker only) If all microservices work well, check the port ${host_ip}:7778, the port may be allocated by other users, you can modify the `compose.yaml`.
+2. If you get errors like "aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host xx.xx.xx.xx:8028", check the `tgi service` first. If the `tgi service` reports the error "Cannot access gated repo for url
+   https://huggingface.co/meta-llama/CodeLlama-7b-hf/resolve/main/config.json.", you need to request model access first. Follow the instructions in the [Required Models](#required-models) section for more information.
+
+3. (Docker only) If all microservices work well, check the port ${host_ip}:7778, the port may be allocated by other users, you can modify the `compose.yaml`.
 
-3. (Docker only) If you get errors like "The container name is in use", change container name in `compose.yaml`.
+4. (Docker only) If you get errors like "The container name is in use", change container name in `compose.yaml`.
diff --git a/CodeGen/benchmark/accuracy/README.md b/CodeGen/benchmark/accuracy/README.md
new file mode 100644
index 000000000..16d21e1a3
--- /dev/null
+++ b/CodeGen/benchmark/accuracy/README.md
@@ -0,0 +1,100 @@
+# CodeGen accuracy Evaluation
+
+## Evaluation Framework
+
+We evaluate accuracy by [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness). It is a framework for the evaluation of code generation models.
+
+## Evaluation FAQs
+
+### Launch CodeGen microservice
+
+Please refer to [CodeGen Examples](https://github.com/opea-project/GenAIExamples/tree/main/CodeGen) and follow the guide to deploy the CodeGen megaservice.
+
+Use the `curl` command to test the CodeGen service and ensure that it has started properly:
+
+```bash
+export CODEGEN_ENDPOINT="http://${your_ip}:7778/v1/codegen"
+curl $CODEGEN_ENDPOINT \
+    -H "Content-Type: application/json" \
+    -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}'
+
+```
+
+### Generation and Evaluation
+
+For evaluating the models on coding tasks or specifically coding LLMs, we follow the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide the command line usage and function call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode are available.
+
+#### command line usage
+
+```shell
+git clone https://github.com/opea-project/GenAIEval
+cd GenAIEval
+pip install -r requirements.txt
+pip install -e .
+
+cd evals/evaluation/bigcode_evaluation_harness/examples
+python main.py --model Qwen/CodeQwen1.5-7B-Chat \
+  --tasks humaneval \
+  --codegen_url $CODEGEN_ENDPOINT \
+  --max_length_generation 2048 \
+  --batch_size 1  \
+  --save_generations \
+  --save_references \
+  --allow_code_execution
+```
+
+**_Note:_** Currently, our framework is designed to execute tasks in full. To ensure the accuracy of results, we advise against using the 'limit' or 'limit_start' parameters to restrict the number of test samples.
+
+### accuracy Result
+
+Here is the tested result for your reference
+
+```json
+{
+  "humaneval": {
+    "pass@1": 0.7195121951219512
+  },
+  "config": {
+    "prefix": "",
+    "do_sample": true,
+    "temperature": 0.2,
+    "top_k": 0,
+    "top_p": 0.95,
+    "n_samples": 1,
+    "eos": "<|endoftext|>",
+    "seed": 0,
+    "model": "Qwen/CodeQwen1.5-7B-Chat",
+    "modeltype": "causal",
+    "peft_model": null,
+    "revision": null,
+    "use_auth_token": false,
+    "trust_remote_code": false,
+    "tasks": "humaneval",
+    "instruction_tokens": null,
+    "batch_size": 1,
+    "max_length_generation": 2048,
+    "precision": "fp32",
+    "load_in_8bit": false,
+    "load_in_4bit": false,
+    "left_padding": false,
+    "limit": null,
+    "limit_start": 0,
+    "save_every_k_tasks": -1,
+    "postprocess": true,
+    "allow_code_execution": true,
+    "generation_only": false,
+    "load_generations_path": null,
+    "load_data_path": null,
+    "metric_output_path": "evaluation_results.json",
+    "save_generations": true,
+    "load_generations_intermediate_paths": null,
+    "save_generations_path": "generations.json",
+    "save_references": true,
+    "save_references_path": "references.json",
+    "prompt": "prompt",
+    "max_memory_per_gpu": null,
+    "check_references": false,
+    "codegen_url": "http://192.168.123.104:31234/v1/codegen"
+  }
+}
+```
diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md
index d7dc3376e..8bdde1f75 100644
--- a/CodeGen/docker_compose/intel/cpu/xeon/README.md
+++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md
@@ -138,7 +138,7 @@ docker compose up -d
    ```bash
    curl http://${host_ip}:9000/v1/chat/completions\
      -X POST \
-     -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_new_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md b/CodeGen/docker_compose/intel/hpu/gaudi/README.md
index 74afd54ae..2a5040ea0 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/README.md
@@ -119,7 +119,7 @@ docker compose up -d
    ```bash
    curl http://${host_ip}:9000/v1/chat/completions\
      -X POST \
-     -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_new_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/CodeGen/tests/test_gmc_on_gaudi.sh b/CodeGen/tests/test_gmc_on_gaudi.sh
index ad16e2108..805237208 100755
--- a/CodeGen/tests/test_gmc_on_gaudi.sh
+++ b/CodeGen/tests/test_gmc_on_gaudi.sh
@@ -34,7 +34,7 @@ function validate_codegen() {
     export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
     echo "$CLIENT_POD"
     accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='codegen')].status.accessUrl}")
-    kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl  -X POST  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log
+    kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl  -X POST  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log
     exit_code=$?
     if [ $exit_code -ne 0 ]; then
         echo "chatqna failed, please check the logs in ${LOG_PATH}!"
diff --git a/CodeGen/tests/test_gmc_on_xeon.sh b/CodeGen/tests/test_gmc_on_xeon.sh
index 92f620365..5f3ff0eae 100755
--- a/CodeGen/tests/test_gmc_on_xeon.sh
+++ b/CodeGen/tests/test_gmc_on_xeon.sh
@@ -34,7 +34,7 @@ function validate_codegen() {
     export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
     echo "$CLIENT_POD"
     accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='codegen')].status.accessUrl}")
-    kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl  -X POST  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log
+    kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl  -X POST  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log
     exit_code=$?
     if [ $exit_code -ne 0 ]; then
         echo "chatqna failed, please check the logs in ${LOG_PATH}!"
diff --git a/CodeTrans/README.md b/CodeTrans/README.md
index 0a00ca902..a1b95b154 100644
--- a/CodeTrans/README.md
+++ b/CodeTrans/README.md
@@ -127,7 +127,7 @@ By default, the UI runs on port 5173 internally.
    http_proxy=""
    curl http://${host_ip}:8008/generate \
      -X POST \
-     -d '{"inputs":"    ### System: Please translate the following Golang codes into  Python codes.    ### Original codes:    '\'''\'''\''Golang    \npackage main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n    '\'''\'''\''    ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+     -d '{"inputs":"    ### System: Please translate the following Golang codes into  Python codes.    ### Original codes:    '\'''\'''\''Golang    \npackage main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n    '\'''\'''\''    ### Translated codes:","parameters":{"max_tokens":17, "do_sample": true}}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/DocSum/README.md b/DocSum/README.md
index 23d662987..ca1ebfeba 100644
--- a/DocSum/README.md
+++ b/DocSum/README.md
@@ -147,9 +147,9 @@ Two ways of consuming Document Summarization Service:
 
    ```bash
    http_proxy=""
-   curl http://${your_ip}:8008/generate \
+   curl http://${host_ip}:8008/generate \
      -X POST \
-     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+     -d '{"inputs":"What is Deep Learning?","parameters":{"max_tokens":17, "do_sample": true}}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md
index 4085365be..a067e9e27 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/README.md
+++ b/DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -105,7 +105,7 @@ docker compose up -d
 1. TGI Service
 
    ```bash
-   curl http://${your_ip}:8008/generate \
+   curl http://${host_ip}:8008/generate \
      -X POST \
      -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
      -H 'Content-Type: application/json'
@@ -114,7 +114,7 @@ docker compose up -d
 2. LLM Microservice
 
    ```bash
-   curl http://${your_ip}:9000/v1/chat/docsum \
+   curl http://${host_ip}:9000/v1/chat/docsum \
      -X POST \
      -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
      -H 'Content-Type: application/json'
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md
index 8ef3b2916..abb4a9bed 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md
@@ -96,7 +96,7 @@ docker compose up -d
 1. TGI Service
 
    ```bash
-   curl http://${your_ip}:8008/generate \
+   curl http://${host_ip}:8008/generate \
      -X POST \
      -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
      -H 'Content-Type: application/json'
@@ -105,7 +105,7 @@ docker compose up -d
 2. LLM Microservice
 
    ```bash
-   curl http://${your_ip}:9000/v1/chat/docsum \
+   curl http://${host_ip}:9000/v1/chat/docsum \
      -X POST \
      -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
      -H 'Content-Type: application/json'
diff --git a/FaqGen/benchmark/accuracy/README.md b/FaqGen/benchmark/accuracy/README.md
new file mode 100644
index 000000000..1c180c395
--- /dev/null
+++ b/FaqGen/benchmark/accuracy/README.md
@@ -0,0 +1,78 @@
+# FaqGen Evaluation
+
+## Dataset
+
+We evaluate performance on the QA dataset [Squad_v2](https://huggingface.co/datasets/rajpurkar/squad_v2). FAQs are generated from the "context" column of the validation dataset, which contains 1204 unique records.
+
+First, download the dataset and put it under "./data".
+
+Extract the unique "context" values, which will be saved to 'data/sqv2_context.json':
+
+```
+python get_context.py
+```
+
+## Generate FAQs
+
+### Launch FaQGen microservice
+
+Please refer to [FaQGen microservice](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/faq-generation/tgi) to set up a microservice endpoint.
+
+```
+export FAQ_ENDPOINT="http://${your_ip}:9000/v1/faqgen"
+```
+
+### Generate FAQs with microservice
+
+Use the microservice endpoint to generate FAQs for dataset.
+
+```
+python generate_FAQ.py
+```
+
+Post-process the output to get the right data, which will be saved to 'data/sqv2_faq.json'.
+
+```
+python post_process_FAQ.py
+```
+
+## Evaluate with Ragas
+
+### Launch TGI service
+
+We use "mistralai/Mixtral-8x7B-Instruct-v0.1" as the LLM referee to evaluate the model. First, we need to launch an LLM endpoint on Gaudi.
+
+```
+export HUGGING_FACE_HUB_TOKEN="your_huggingface_token"
+bash launch_tgi.sh
+```
+
+Get the endpoint:
+
+```
+export LLM_ENDPOINT="http://${ip_address}:8082"
+```
+
+Verify the service:
+
+```bash
+curl http://${ip_address}:8082/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+    -H 'Content-Type: application/json'
+```
+
+### Evaluate
+
+Evaluate the performance with the LLM:
+
+```
+python evaluate.py
+```
+
+### Performance Result
+
+Here is the tested result for your reference
+| answer_relevancy | faithfulness | context_utilization | reference_free_rubrics_score |
+| ---- | ---- |---- |---- |
+| 0.7191 | 0.9681 | 0.8964 | 4.4125|
diff --git a/FaqGen/benchmark/accuracy/evaluate.py b/FaqGen/benchmark/accuracy/evaluate.py
new file mode 100644
index 000000000..30998da4d
--- /dev/null
+++ b/FaqGen/benchmark/accuracy/evaluate.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Score the generated FAQs with Ragas metrics, passing an LLM endpoint to RagasMetric as the model."""
+
+import json
+import os
+
+from evals.metrics.ragas import RagasMetric
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+
+# Endpoint passed to RagasMetric as its model; override via the LLM_ENDPOINT environment variable.
+llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082")
+
+# Source contexts, looked up below by stringified record index.
+with open("data/sqv2_context.json", "r") as f:
+    sqv2_context = json.load(f)
+
+# Generated FAQs, looked up below by stringified record index; some indices may be missing.
+with open("data/sqv2_faq.json", "r") as f:
+    sqv2_faq = json.load(f)
+
+templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
+        TEXT: {text}
+        Do not use any prefix or suffix to the FAQ.
+    """
+
+# Record indices 0..number-1 are considered for evaluation.
+number = 1204
+question = []
+answer = []
+contexts = []
+for i in range(number):
+    key = str(i)
+    # Records without a generated FAQ are skipped instead of aborting the run with a KeyError.
+    if key not in sqv2_faq:
+        continue
+    inputs_faq = templ.format_map({"text": sqv2_context[key]})
+
+    question.append(inputs_faq)
+    answer.append(sqv2_faq[key])
+    contexts.append([inputs_faq])
+
+# Every evaluated sample gets the placeholder ground truth "None".
+ground_truth = ["None"] * len(question)
+
+embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"]
+metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq)
+
+test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts}
+
+metric.measure(test_case)
+print(metric.score)
diff --git a/FaqGen/benchmark/accuracy/generate_FAQ.py b/FaqGen/benchmark/accuracy/generate_FAQ.py
new file mode 100644
index 000000000..2ed70b9ef
--- /dev/null
+++ b/FaqGen/benchmark/accuracy/generate_FAQ.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Send each context to the FaqGen endpoint and store the raw response of every request on disk."""
+
+import json
+import os
+import time
+
+import requests
+
+# Override the target service with the FAQ_ENDPOINT environment variable.
+llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen")
+
+with open("data/sqv2_context.json", "r") as f:
+    sqv2_context = json.load(f)
+
+# open(..., "w") does not create missing directories, so make sure the output folder exists.
+os.makedirs("data/result", exist_ok=True)
+
+start_time = time.time()
+headers = {"Content-Type": "application/json"}
+for i in range(1204):
+    start_time_tmp = time.time()
+    print(i)
+    inputs = sqv2_context[str(i)]
+    data = {"query": inputs, "max_new_tokens": 128}
+    response = requests.post(llm_endpoint, json=data, headers=headers)
+    # The context is written first, immediately followed by the raw response body.
+    with open(f"data/result/sqv2_faq_{i}", "w") as f:
+        f.write(inputs)
+        f.write(str(response.content, encoding="utf-8"))
+    print(f"Cost {time.time()-start_time_tmp} seconds")
+print(f"\n Finished! \n Totally Cost {time.time()-start_time} seconds\n")
diff --git a/FaqGen/benchmark/accuracy/get_context.py b/FaqGen/benchmark/accuracy/get_context.py
new file mode 100644
index 000000000..8cb73a054
--- /dev/null
+++ b/FaqGen/benchmark/accuracy/get_context.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Collect the unique "context" values of the SQuAD v2 validation parquet file into a JSON file."""
+
+import json
+import os
+
+import pandas as pd
+
+data_path = "./data"
+parquet_file = os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet")
+output_file = os.path.join(data_path, "sqv2_context.json")
+
+# Number the distinct contexts 0, 1, 2, ... in the order `unique()` returns them.
+unique_contexts = pd.read_parquet(parquet_file)["context"].unique()
+sq_context_d = {idx: passage for idx, passage in enumerate(list(unique_contexts))}
+
+with open(output_file, "w") as outfile:
+    json.dump(sq_context_d, outfile)
diff --git a/FaqGen/benchmark/accuracy/launch_tgi.sh b/FaqGen/benchmark/accuracy/launch_tgi.sh
new file mode 100644
index 000000000..b3e04bbbf
--- /dev/null
+++ b/FaqGen/benchmark/accuracy/launch_tgi.sh
@@ -0,0 +1,31 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Launch a TGI-Gaudi container serving $model_name on host port $port_number.
+# Reads HUGGING_FACE_HUB_TOKEN, https_proxy and http_proxy from the calling environment.
+
+max_input_tokens=3072
+max_total_tokens=4096
+port_number=8082
+model_name="mistralai/Mixtral-8x7B-Instruct-v0.1"
+volume="./data"
+# NOTE: no restart policy is set, because Docker rejects --restart combined with --rm.
+docker run -it --rm \
+    --name="tgi_Mixtral" \
+    -p "$port_number":80 \
+    -v "$volume":/data \
+    --runtime=habana \
+    -e HUGGING_FACE_HUB_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+    -e HABANA_VISIBLE_DEVICES=all \
+    -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+    -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
+    --cap-add=sys_nice \
+    --ipc=host \
+    -e HTTPS_PROXY="$https_proxy" \
+    -e HTTP_PROXY="$http_proxy" \
+    ghcr.io/huggingface/tgi-gaudi:2.0.1 \
+    --model-id "$model_name" \
+    --max-input-tokens "$max_input_tokens" \
+    --max-total-tokens "$max_total_tokens" \
+    --sharded true \
+    --num-shard 2
diff --git a/FaqGen/benchmark/accuracy/post_process_FAQ.py b/FaqGen/benchmark/accuracy/post_process_FAQ.py
new file mode 100644
index 000000000..83e6b8350
--- /dev/null
+++ b/FaqGen/benchmark/accuracy/post_process_FAQ.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Extract the final FAQ text from each raw result file under data/result/ into data/sqv2_faq.json."""
+
+import json
+
+faq_dict = {}
+fails = []
+for i in range(1204):
+    with open(f"data/result/sqv2_faq_{i}", "r") as f:
+        data = f.readlines()
+    # The final output is expected on the sixth line from the end, after a 6-character prefix
+    # (presumably the "data: " marker of a streamed response -- TODO confirm). Shorter files count as failures.
+    result = data[-6][6:] if len(data) >= 6 else ""
+    if "LLMChain/final_output" not in result:
+        print(f"error1: fail for {i}")
+        fails.append(i)
+        continue
+    try:
+        result2 = json.loads(result)
+        result3 = result2["ops"][0]["value"]["text"]
+        faq_dict[str(i)] = result3
+    except (ValueError, KeyError, IndexError, TypeError):
+        # Malformed JSON or an unexpected payload shape: record the index and move on.
+        print(f"error2: fail for {i}")
+        fails.append(i)
+        continue
+with open("data/sqv2_faq.json", "w") as outfile:
+    json.dump(faq_dict, outfile)
+print("Failure index:")
+print(fails)
diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
index 65696cc54..3eba01a71 100644
--- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
+++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
@@ -25,6 +25,7 @@
     display:block;
 }
 """
+tmp_upload_folder = "/tmp/gradio/"
 
 # create a FastAPI app
 app = FastAPI()
@@ -122,11 +123,14 @@ def http_bot(state, request: gr.Request):
                 video_file = metadata["source_video"]
                 state.video_file = os.path.join(static_dir, metadata["source_video"])
                 state.time_of_frame_ms = metadata["time_of_frame_ms"]
-                splited_video_path = split_video(
-                    state.video_file, state.time_of_frame_ms, tmp_dir, f"{state.time_of_frame_ms}__{video_file}"
-                )
+                try:
+                    splited_video_path = split_video(
+                        state.video_file, state.time_of_frame_ms, tmp_dir, f"{state.time_of_frame_ms}__{video_file}"
+                    )
+                except:
+                    print(f"video {state.video_file} does not exist in UI host!")
+                    splited_video_path = None
                 state.split_video = splited_video_path
-                print(splited_video_path)
         else:
             raise requests.exceptions.RequestException
     except requests.exceptions.RequestException as e:
@@ -143,9 +147,19 @@ def http_bot(state, request: gr.Request):
 
 def ingest_video_gen_transcript(filepath, request: gr.Request):
     yield (gr.Textbox(visible=True, value="Please wait for ingesting your uploaded video into database..."))
-    basename = os.path.basename(filepath)
+    verified_filepath = os.path.normpath(filepath)
+    if not verified_filepath.startswith(tmp_upload_folder):
+        print("Found malicious video file name!")
+        yield (
+            gr.Textbox(
+                visible=True,
+                value="Your uploaded video's file name has special characters that are not allowed. Please consider update the video file name!",
+            )
+        )
+        return
+    basename = os.path.basename(verified_filepath)
     dest = os.path.join(static_dir, basename)
-    shutil.copy(filepath, dest)
+    shutil.copy(verified_filepath, dest)
     print("Done copy uploaded file to static folder!")
     headers = {
         # 'Content-Type': 'multipart/form-data'
@@ -185,9 +199,19 @@ def ingest_video_gen_transcript(filepath, request: gr.Request):
 
 def ingest_video_gen_caption(filepath, request: gr.Request):
     yield (gr.Textbox(visible=True, value="Please wait for ingesting your uploaded video into database..."))
-    basename = os.path.basename(filepath)
+    verified_filepath = os.path.normpath(filepath)
+    if not verified_filepath.startswith(tmp_upload_folder):
+        print("Found malicious video file name!")
+        yield (
+            gr.Textbox(
+                visible=True,
+                value="Your uploaded video's file name has special characters that are not allowed. Please consider update the video file name!",
+            )
+        )
+        return
+    basename = os.path.basename(verified_filepath)
     dest = os.path.join(static_dir, basename)
-    shutil.copy(filepath, dest)
+    shutil.copy(verified_filepath, dest)
     print("Done copy uploaded file to static folder!")
     headers = {
         # 'Content-Type': 'multipart/form-data'
diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
index dd1f59f27..312f191ff 100644
--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
@@ -271,7 +271,7 @@ Please refer to [keycloak_setup_guide](keycloak_setup_guide.md) for more detail
    ```bash
    curl http://${host_ip}:9000/v1/chat/completions\
      -X POST \
-     -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+     -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
      -H 'Content-Type: application/json'
    ```
 
diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/README.md b/SearchQnA/docker_compose/intel/cpu/xeon/README.md
index f31975ac6..5dbd77464 100644
--- a/SearchQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/SearchQnA/docker_compose/intel/cpu/xeon/README.md
@@ -140,7 +140,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 
 ```
diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md
index b34398c35..6021c7938 100644
--- a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -150,7 +150,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 
 ```
diff --git a/Translation/docker_compose/intel/cpu/xeon/README.md b/Translation/docker_compose/intel/cpu/xeon/README.md
index 306f8e35d..651b20950 100644
--- a/Translation/docker_compose/intel/cpu/xeon/README.md
+++ b/Translation/docker_compose/intel/cpu/xeon/README.md
@@ -10,9 +10,24 @@ For detailed information about these instance types, you can refer to this [link
 
 After launching your instance, you can connect to it using SSH (for Linux instances) or Remote Desktop Protocol (RDP) (for Windows instances). From there, you'll have full access to your Xeon server, allowing you to install, configure, and manage your applications as needed.
 
-## 🚀 Build Docker Images
+## 🚀 Prepare Docker Images
 
-First of all, you need to build Docker Images locally and install the python package of it.
+For Docker Images, you have two options to prepare them.
+
+1. Pull the docker images from docker hub.
+
+   - More stable to use.
+   - Will be automatically downloaded when using docker compose command.
+
+2. Build the docker images from source.
+
+   - Contain the latest new features.
+
+   - Need to be manually built.
+
+If you choose to pull docker images from docker hub, skip this section and go to [Start Microservices](#start-microservices) part directly.
+
+Follow the instructions below to build the docker images from source.
 
 ### 1. Build LLM Image
 
@@ -45,7 +60,7 @@ docker build -t opea/translation-ui:latest --build-arg https_proxy=$https_proxy
 
 ```bash
 cd GenAIComps
-docker build -t opea/translation-nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile .
+docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile .
 ```
 
 Then run the command `docker images`, you will have the following Docker Images:
@@ -53,7 +68,7 @@ Then run the command `docker images`, you will have the following Docker Images:
 1. `opea/llm-tgi:latest`
 2. `opea/translation:latest`
 3. `opea/translation-ui:latest`
-4. `opea/translation-nginx:latest`
+4. `opea/nginx:latest`
 
 ## 🚀 Start Microservices
 
@@ -101,6 +116,15 @@ Change the `LLM_MODEL_ID` below for your needs.
 docker compose up -d
 ```
 
+> Note: The docker images will be automatically downloaded from `docker hub`:
+
+```bash
+docker pull opea/llm-tgi:latest
+docker pull opea/translation:latest
+docker pull opea/translation-ui:latest
+docker pull opea/nginx:latest
+```
+
 ### Validate Microservices
 
 1. TGI Service
diff --git a/Translation/docker_compose/intel/cpu/xeon/compose.yaml b/Translation/docker_compose/intel/cpu/xeon/compose.yaml
index e8eafca4f..108a5086d 100644
--- a/Translation/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/Translation/docker_compose/intel/cpu/xeon/compose.yaml
@@ -66,7 +66,7 @@ services:
     ipc: host
     restart: always
   translation-xeon-nginx-server:
-    image: ${REGISTRY:-opea}/translation-nginx:${TAG:-latest}
+    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
     container_name: translation-xeon-nginx-server
     depends_on:
       - translation-xeon-backend-server
diff --git a/Translation/docker_compose/intel/hpu/gaudi/README.md b/Translation/docker_compose/intel/hpu/gaudi/README.md
index 9f234496c..a9e807a12 100644
--- a/Translation/docker_compose/intel/hpu/gaudi/README.md
+++ b/Translation/docker_compose/intel/hpu/gaudi/README.md
@@ -2,9 +2,24 @@
 
 This document outlines the deployment process for a Translation application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as We will publish the Docker images to Docker Hub, it will simplify the deployment process for this service.
 
-## 🚀 Build Docker Images
+## 🚀 Prepare Docker Images
 
-First of all, you need to build Docker Images locally. This step can be ignored after the Docker images published to Docker hub.
+For Docker Images, you have two options to prepare them.
+
+1. Pull the docker images from docker hub.
+
+   - More stable to use.
+   - Will be automatically downloaded when using docker compose command.
+
+2. Build the docker images from source.
+
+   - Contain the latest new features.
+
+   - Need to be manually built.
+
+If you choose to pull docker images from docker hub, skip to [Start Microservices](#start-microservices) part directly.
+
+Follow the instructions below to build the docker images from source.
 
 ### 1. Build LLM Image
 
@@ -37,7 +52,7 @@ docker build -t opea/translation-ui:latest --build-arg https_proxy=$https_proxy
 
 ```bash
 cd GenAIComps
-docker build -t opea/translation-nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile .
+docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile .
 ```
 
 Then run the command `docker images`, you will have the following four Docker Images:
@@ -45,7 +60,7 @@ Then run the command `docker images`, you will have the following four Docker Im
 1. `opea/llm-tgi:latest`
 2. `opea/translation:latest`
 3. `opea/translation-ui:latest`
-4. `opea/translation-nginx:latest`
+4. `opea/nginx:latest`
 
 ## 🚀 Start Microservices
 
@@ -93,6 +108,15 @@ Change the `LLM_MODEL_ID` below for your needs.
 docker compose up -d
 ```
 
+> Note: The docker images will be automatically downloaded from `docker hub`:
+
+```bash
+docker pull opea/llm-tgi:latest
+docker pull opea/translation:latest
+docker pull opea/translation-ui:latest
+docker pull opea/nginx:latest
+```
+
 ### Validate Microservices
 
 1. TGI Service
diff --git a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
index 6eefd6492..3d8b0ab47 100644
--- a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -67,7 +67,7 @@ services:
     ipc: host
     restart: always
   translation-gaudi-nginx-server:
-    image: ${REGISTRY:-opea}/translation-nginx:${TAG:-latest}
+    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
     container_name: translation-gaudi-nginx-server
     depends_on:
       - translation-gaudi-backend-server
diff --git a/Translation/docker_image_build/build.yaml b/Translation/docker_image_build/build.yaml
index a1562060b..2230a8657 100644
--- a/Translation/docker_image_build/build.yaml
+++ b/Translation/docker_image_build/build.yaml
@@ -28,4 +28,4 @@ services:
       context: GenAIComps
       dockerfile: comps/nginx/Dockerfile
     extends: translation
-    image: ${REGISTRY:-opea}/translation-nginx:${TAG:-latest}
+    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/README.md b/VisualQnA/docker_compose/intel/cpu/xeon/README.md
index 3a6058e0c..8f0d5b6b3 100644
--- a/VisualQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/VisualQnA/docker_compose/intel/cpu/xeon/README.md
@@ -138,28 +138,28 @@ Follow the instructions to validate MicroServices.
 
 2. MegaService
 
-   ```bash
-   curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{
-        "messages": [
-         {
-           "role": "user",
-           "content": [
-             {
-               "type": "text",
-               "text": "What'\''s in this image?"
-             },
-             {
-               "type": "image_url",
-               "image_url": {
-                 "url": "https://www.ilankelman.org/stopsigns/australia.jpg"
-               }
-             }
-           ]
-         }
-       ],
-       "max_tokens": 300
-       }'
-   ```
+```bash
+curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "What'\''s in this image?"
+          },
+          {
+            "type": "image_url",
+            "image_url": {
+              "url": "https://www.ilankelman.org/stopsigns/australia.jpg"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 300
+    }'
+```
 
 ## 🚀 Launch the UI
 
diff --git a/VisualQnA/docker_compose/intel/hpu/gaudi/README.md b/VisualQnA/docker_compose/intel/hpu/gaudi/README.md
index 2a8f3a276..84783353a 100644
--- a/VisualQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/VisualQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -95,28 +95,28 @@ Follow the instructions to validate MicroServices.
 
 2. MegaService
 
-   ```bash
-   curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{
-        "messages": [
-         {
-           "role": "user",
-           "content": [
-             {
-               "type": "text",
-               "text": "What'\''s in this image?"
-             },
-             {
-               "type": "image_url",
-               "image_url": {
-                 "url": "https://www.ilankelman.org/stopsigns/australia.jpg"
-               }
-             }
-           ]
-         }
-       ],
-       "max_tokens": 300
-       }'
-   ```
+```bash
+curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "What'\''s in this image?"
+          },
+          {
+            "type": "image_url",
+            "image_url": {
+              "url": "https://www.ilankelman.org/stopsigns/australia.jpg"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 300
+    }'
+```
 
 ## 🚀 Launch the UI