From 72840399d63bf86a38896799e8311a2f7c35b1ab Mon Sep 17 00:00:00 2001 From: root Date: Thu, 12 Sep 2024 05:43:22 +0000 Subject: [PATCH 1/2] add tgi bf16 setup on CPU. --- ChatQnA/kubernetes/intel/README.md | 11 + .../intel/cpu/xeon/manifest/chatqna_bf16.yaml | 1467 +++++++++++++++++ 2 files changed, 1478 insertions(+) create mode 100644 ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml diff --git a/ChatQnA/kubernetes/intel/README.md b/ChatQnA/kubernetes/intel/README.md index 86dde2c54..81285528a 100644 --- a/ChatQnA/kubernetes/intel/README.md +++ b/ChatQnA/kubernetes/intel/README.md @@ -17,6 +17,17 @@ sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" chat kubectl apply -f chatqna.yaml ``` +Since CPUs, such as Intel Cooper Lake, Sapphire Rapids, support `bfloat16`, we can add `--dtype bfloat16` when setup the `huggingface/text-generation-inference` server. And if you have such CPUs, you can run the following commands: + +``` +# label your node for scheduling the service on it automatically +kubectl label node 'your-node-name' node-type=node-bfloat16 + +# add `nodeSelector` for the `huggingface/text-generation-inference` server at `chatqna_bf16.yaml` +# create +kubectl apply -f chatqna_bf16.yaml +``` + ## Deploy On Gaudi ``` diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml new file mode 100644 index 000000000..3225cb79b --- /dev/null +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml @@ -0,0 +1,1467 @@ +--- +# Source: chatqna/charts/chatqna-ui/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-chatqna-ui-config + labels: + helm.sh/chart: chatqna-ui-1.0.0 + app.kubernetes.io/name: chatqna-ui + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +data: + APP_BACKEND_SERVICE_ENDPOINT: "/v1/chatqna" + APP_DATA_PREP_SERVICE_URL: "/v1/dataprep" + CHAT_BASE_URL: "/v1/chatqna" + UPLOAD_FILE_BASE_URL: "/v1/dataprep" + GET_FILE: "/v1/dataprep/get_file" + DELETE_FILE: "/v1/dataprep/delete_file" + BASE_URL: "/v1/chatqna" + DOC_BASE_URL: "/v1/chatqna" + BASIC_URL: "/v1/chatqna" + VITE_CODE_GEN_URL: "/v1/chatqna" + VITE_DOC_SUM_URL: "/v1/chatqna" +--- +# Source: chatqna/charts/data-prep/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-data-prep-config + labels: + helm.sh/chart: data-prep-1.0.0 + app.kubernetes.io/name: data-prep + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +data: + TEI_ENDPOINT: "http://chatqna-tei" + EMBED_MODEL: "" + REDIS_URL: "redis://chatqna-redis-vector-db:6379" + INDEX_NAME: "rag-redis" + KEY_INDEX_NAME: "file-keys" + SEARCH_BATCH_SIZE: "10" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + HF_HOME: "/tmp/.cache/huggingface" + http_proxy: "" + https_proxy: "" + no_proxy: "" + LOGFLAG: "" +--- +# Source: chatqna/charts/embedding-usvc/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-embedding-usvc-config + labels: + helm.sh/chart: embedding-usvc-1.0.0 + app.kubernetes.io/name: embedding-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +data: + TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" + http_proxy: "" + https_proxy: "" + no_proxy: "" + LOGFLAG: "" +--- +# Source: chatqna/charts/llm-uservice/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-llm-uservice-config + labels: + helm.sh/chart: llm-uservice-1.0.0 + app.kubernetes.io/name: llm-uservice + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +data: + TGI_LLM_ENDPOINT: "http://chatqna-tgi" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + HF_HOME: "/tmp/.cache/huggingface" + http_proxy: "" + https_proxy: "" + no_proxy: "" + LOGFLAG: "" +--- +# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-reranking-usvc-config + labels: + helm.sh/chart: reranking-usvc-1.0.0 + app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +data: + TEI_RERANKING_ENDPOINT: "http://chatqna-teirerank" + http_proxy: "" + https_proxy: "" + no_proxy: "" + LOGFLAG: "" +--- +# Source: chatqna/charts/retriever-usvc/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-retriever-usvc-config + labels: + helm.sh/chart: retriever-usvc-1.0.0 + app.kubernetes.io/name: retriever-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +data: + TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" + EMBED_MODEL: "" + REDIS_URL: "redis://chatqna-redis-vector-db:6379" + INDEX_NAME: "rag-redis" + EASYOCR_MODULE_PATH: "/tmp/.EasyOCR" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HF_HOME: "/tmp/.cache/huggingface" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + LOGFLAG: "" +--- +# Source: chatqna/charts/tei/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tei-config + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-base-en-v1.5" + PORT: "2081" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" + MAX_WARMUP_SEQUENCE_LENGTH: "512" +--- +# Source: chatqna/charts/teirerank/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-teirerank-config + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-reranker-base" + PORT: "2082" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" +--- +# Source: chatqna/charts/tgi/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tgi-config + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "Intel/neural-chat-7b-v3-3" + DTYPE: "bfloat16" + PORT: "2080" + HF_TOKEN: "insert-your-huggingface-token-here" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + CUDA_GRAPHS: "0" +--- +# Source: chatqna/templates/nginx-deployment.yaml +apiVersion: v1 +data: + default.conf: |+ + # Copyright (C) 2024 Intel Corporation + # SPDX-License-Identifier: Apache-2.0 + + + server { + listen 80; + listen [::]:80; + + location /home { + alias /usr/share/nginx/html/index.html; + } + + location / { + proxy_pass http://chatqna-chatqna-ui:5174; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /v1/chatqna { + proxy_pass http://chatqna:8888; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /v1/dataprep { + proxy_pass http://chatqna-data-prep:6007; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /v1/dataprep/get_file { + proxy_pass http://chatqna-data-prep:6007; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /v1/dataprep/delete_file { + proxy_pass http://chatqna-data-prep:6007; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + +kind: ConfigMap +metadata: + name: chatqna-nginx-config +--- +# Source: chatqna/charts/chatqna-ui/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-chatqna-ui + labels: + helm.sh/chart: chatqna-ui-1.0.0 + app.kubernetes.io/name: chatqna-ui + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 5174 + targetPort: ui + protocol: TCP + name: ui + selector: + app.kubernetes.io/name: chatqna-ui + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/data-prep/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-data-prep + labels: + helm.sh/chart: data-prep-1.0.0 + app.kubernetes.io/name: data-prep + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 6007 + targetPort: 6007 + protocol: TCP + name: data-prep + selector: + app.kubernetes.io/name: data-prep + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/embedding-usvc/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-embedding-usvc + labels: + helm.sh/chart: embedding-usvc-1.0.0 + app.kubernetes.io/name: embedding-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 6000 + targetPort: 6000 + protocol: TCP + name: embedding-usvc + selector: + app.kubernetes.io/name: embedding-usvc + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/llm-uservice/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-llm-uservice + labels: + helm.sh/chart: llm-uservice-1.0.0 + app.kubernetes.io/name: llm-uservice + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9000 + targetPort: 9000 + protocol: TCP + name: llm-uservice + selector: + app.kubernetes.io/name: llm-uservice + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/redis-vector-db/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-redis-vector-db + labels: + helm.sh/chart: redis-vector-db-1.0.0 + app.kubernetes.io/name: redis-vector-db + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "7.2.0-v9" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis-service + - port: 8001 + targetPort: 8001 + protocol: TCP + name: redis-insight + selector: + app.kubernetes.io/name: redis-vector-db + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/reranking-usvc/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-reranking-usvc + labels: + helm.sh/chart: reranking-usvc-1.0.0 + app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP + name: reranking-usvc + selector: + app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/retriever-usvc/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-retriever-usvc + labels: + helm.sh/chart: retriever-usvc-1.0.0 + app.kubernetes.io/name: retriever-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 7000 + targetPort: 7000 + protocol: TCP + name: retriever-usvc + selector: + app.kubernetes.io/name: retriever-usvc + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/tei/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tei + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2081 + protocol: TCP + name: tei + selector: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/teirerank/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-teirerank + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2082 + protocol: TCP + name: teirerank + selector: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/tgi/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tgi + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2080 + protocol: TCP + name: tgi + selector: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/templates/nginx-deployment.yaml +apiVersion: v1 +kind: Service +metadata: + name: chatqna-nginx +spec: + ports: + - port: 80 + protocol: TCP + targetPort: 80 + selector: + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app: chatqna-nginx + type: NodePort +--- +# Source: chatqna/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna + labels: + helm.sh/chart: chatqna-1.0.0 + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8888 + targetPort: 8888 + protocol: TCP + name: chatqna + selector: + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app: chatqna +--- +# Source: chatqna/charts/chatqna-ui/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-chatqna-ui + labels: + helm.sh/chart: chatqna-ui-1.0.0 + app.kubernetes.io/name: chatqna-ui + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: chatqna-ui + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + helm.sh/chart: chatqna-ui-1.0.0 + app.kubernetes.io/name: chatqna-ui + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm + spec: + securityContext: + {} + containers: + - name: chatqna-ui + envFrom: + - configMapRef: + name: chatqna-chatqna-ui-config + securityContext: + {} + image: "opea/chatqna-conversation-ui:latest" + imagePullPolicy: IfNotPresent + ports: + - name: ui + containerPort: 80 + protocol: TCP + resources: + {} + volumeMounts: + - mountPath: /tmp + name: tmp + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/data-prep/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-data-prep + labels: + helm.sh/chart: data-prep-1.0.0 + app.kubernetes.io/name: data-prep + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: data-prep + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: data-prep + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: chatqna + envFrom: + - configMapRef: + name: chatqna-data-prep-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/dataprep-redis:latest" + imagePullPolicy: IfNotPresent + ports: + - name: data-prep + containerPort: 6007 + protocol: TCP + volumeMounts: + - mountPath: /tmp + name: tmp + livenessProbe: + failureThreshold: 24 + httpGet: + path: v1/health_check + port: data-prep + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: v1/health_check + port: data-prep + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: v1/health_check + port: data-prep + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/embedding-usvc/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-embedding-usvc + labels: + helm.sh/chart: embedding-usvc-1.0.0 + app.kubernetes.io/name: embedding-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: embedding-usvc + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: embedding-usvc + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: chatqna + envFrom: + - configMapRef: + name: chatqna-embedding-usvc-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/embedding-tei:latest" + imagePullPolicy: IfNotPresent + ports: + - name: embedding-usvc + containerPort: 6000 + protocol: TCP + volumeMounts: + - mountPath: /tmp + name: tmp + livenessProbe: + failureThreshold: 24 + httpGet: + path: v1/health_check + port: embedding-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: v1/health_check + port: embedding-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: v1/health_check + port: embedding-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/llm-uservice/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-llm-uservice + labels: + helm.sh/chart: llm-uservice-1.0.0 + app.kubernetes.io/name: llm-uservice + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: llm-uservice + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: llm-uservice + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: chatqna + envFrom: + - configMapRef: + name: chatqna-llm-uservice-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/llm-tgi:latest" + imagePullPolicy: IfNotPresent + ports: + - name: llm-uservice + containerPort: 9000 + protocol: TCP + volumeMounts: + - mountPath: /tmp + name: tmp + livenessProbe: + failureThreshold: 24 + httpGet: + path: v1/health_check + port: llm-uservice + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: v1/health_check + port: llm-uservice + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: v1/health_check + port: llm-uservice + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/redis-vector-db/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-redis-vector-db + labels: + helm.sh/chart: redis-vector-db-1.0.0 + app.kubernetes.io/name: redis-vector-db + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "7.2.0-v9" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: redis-vector-db + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: redis-vector-db + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: redis-vector-db + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "redis/redis-stack:7.2.0-v9" + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: data-volume + - mountPath: /redisinsight + name: redisinsight-volume + - mountPath: /tmp + name: tmp + ports: + - name: redis-service + containerPort: 6379 + protocol: TCP + - name: redis-insight + containerPort: 8001 + protocol: TCP + startupProbe: + tcpSocket: + port: 6379 # Probe the Redis port + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + resources: + {} + volumes: + - name: data-volume + emptyDir: {} + - name: redisinsight-volume + emptyDir: {} + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/reranking-usvc/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-reranking-usvc + labels: + helm.sh/chart: reranking-usvc-1.0.0 + app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: chatqna + envFrom: + - configMapRef: + name: chatqna-reranking-usvc-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/reranking-tei:latest" + imagePullPolicy: IfNotPresent + ports: + - name: reranking-usvc + containerPort: 8000 + protocol: TCP + volumeMounts: + - mountPath: /tmp + name: tmp + livenessProbe: + failureThreshold: 24 + httpGet: + path: v1/health_check + port: reranking-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: v1/health_check + port: reranking-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: v1/health_check + port: reranking-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/retriever-usvc/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-retriever-usvc + labels: + helm.sh/chart: retriever-usvc-1.0.0 + app.kubernetes.io/name: retriever-usvc + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: retriever-usvc + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: retriever-usvc + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: chatqna + envFrom: + - configMapRef: + name: chatqna-retriever-usvc-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/retriever-redis:latest" + imagePullPolicy: IfNotPresent + ports: + - name: retriever-usvc + containerPort: 7000 + protocol: TCP + volumeMounts: + - mountPath: /tmp + name: tmp + livenessProbe: + failureThreshold: 24 + httpGet: + path: v1/health_check + port: retriever-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: v1/health_check + port: retriever-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: v1/health_check + port: retriever-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/tei/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tei + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: tei + envFrom: + - configMapRef: + name: chatqna-tei-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2081 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: model-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/teirerank/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-teirerank + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: teirerank + envFrom: + - configMapRef: + name: chatqna-teirerank-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2082 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: model-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/tgi/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tgi + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + spec: + nodeSelector: + node-type: node-bfloat16 + securityContext: + {} + containers: + - name: tgi + envFrom: + - configMapRef: + name: chatqna-tgi-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2080 + protocol: TCP + livenessProbe: + failureThreshold: 24 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + resources: + {} + volumes: + - name: model-volume + emptyDir: {} + - name: tmp + emptyDir: {} +--- +# Source: chatqna/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna + labels: + helm.sh/chart: chatqna-1.0.0 + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm + app: chatqna +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app: chatqna + spec: + securityContext: + null + containers: + - name: chatqna + env: + - name: LLM_SERVICE_HOST_IP + value: chatqna-llm-uservice + - name: RERANK_SERVICE_HOST_IP + value: chatqna-reranking-usvc + - name: RETRIEVER_SERVICE_HOST_IP + value: chatqna-retriever-usvc + - name: EMBEDDING_SERVICE_HOST_IP + value: chatqna-embedding-usvc + - name: GUARDRAIL_SERVICE_HOST_IP + value: chatqna-guardrails-usvc + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/chatqna:latest" + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /tmp + name: tmp + ports: + - name: chatqna + containerPort: 8888 + protocol: TCP + resources: + null + volumes: + - name: tmp + emptyDir: {} +--- +# Source: chatqna/templates/nginx-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-nginx + labels: + helm.sh/chart: chatqna-1.0.0 + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "v1.0" + app.kubernetes.io/managed-by: Helm + app: chatqna-nginx +spec: + selector: + matchLabels: + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app: chatqna-nginx + template: + metadata: + labels: + app.kubernetes.io/name: chatqna + app.kubernetes.io/instance: chatqna + app: chatqna-nginx + spec: + containers: + - image: nginx:1.27.1 + imagePullPolicy: IfNotPresent + name: nginx + volumeMounts: + - mountPath: /etc/nginx/conf.d + name: nginx-config-volume + securityContext: {} + volumes: + - configMap: + defaultMode: 420 + name: chatqna-nginx-config + name: nginx-config-volume From be29bf3642103778116150f8c20f0ab119ef5792 Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:33:46 +0800 Subject: [PATCH 2/2] Update ChatQnA/kubernetes/intel/README.md Co-authored-by: Eero Tamminen --- ChatQnA/kubernetes/intel/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChatQnA/kubernetes/intel/README.md b/ChatQnA/kubernetes/intel/README.md index 81285528a..ade0536a2 100644 --- a/ChatQnA/kubernetes/intel/README.md +++ b/ChatQnA/kubernetes/intel/README.md @@ -17,7 +17,7 @@ sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" chat kubectl apply -f chatqna.yaml ``` -Since CPUs, such as Intel Cooper Lake, Sapphire Rapids, support `bfloat16`, we can add `--dtype bfloat16` when setup the `huggingface/text-generation-inference` server. And if you have such CPUs, you can run the following commands: +Newer CPUs such as Intel Cooper Lake, Sapphire Rapids, support [`bfloat16` data type](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format). If you have such CPUs, and given model supports `bfloat16`, adding `--dtype bfloat16` argument for `huggingface/text-generation-inference` server halves its memory usage and speeds it a bit. To use it, run the following commands: ``` # label your node for scheduling the service on it automatically