7 changes: 4 additions & 3 deletions helm-charts/common/tgi/README.md
@@ -7,13 +7,14 @@ Helm chart for deploying Hugging Face Text Generation Inference service.
To install the chart, run the following:

```console
cd GenAIInfra/helm-charts/common
cd GenAIInfra/helm-charts/common/tgi
export MODELDIR=/mnt/opea-models
export MODELNAME="Intel/neural-chat-7b-v3-3"
export HFTOKEN="insert-your-huggingface-token-here"
helm install tgi tgi --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
helm dependency update
helm install tgi . --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
# To deploy on Gaudi enabled kubernetes cluster
# helm install tgi tgi --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml
# helm install tgi . --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml
```

By default, the tgi service will download the "Intel/neural-chat-7b-v3-3" model, which is about 54GB.
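
After installation, a quick smoke test confirms the service answers requests. This is a minimal sketch, assuming the default release name `tgi`, a service listening on port 80, and the `app.kubernetes.io/name=tgi` label (adjust to your deployment):

```console
# Model download is large (~54GB), so allow a generous timeout
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=tgi --timeout=30m
kubectl port-forward svc/tgi 2080:80 &
curl http://localhost:2080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}'
```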
36 changes: 36 additions & 0 deletions helm-charts/common/tgi/rocm-values.yaml
@@ -0,0 +1,36 @@
# Copyright (C) 2025 Advanced Micro Devices, Inc.

accelDevice: "rocm"
image:
  repository: ghcr.io/huggingface/text-generation-inference
  tag: "2.4.1-rocm"
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
USE_FLASH_ATTENTION: "false"
FLASH_ATTENTION_RECOMPUTE: "false"
HIP_VISIBLE_DEVICES: "0"
MAX_BATCH_SIZE: "4"
extraCmdArgs: ["--num-shard", "1"]
resources:
  limits:
    amd.com/gpu: "1"
  requests:
    cpu: 1
    memory: 16Gi
securityContext:
  readOnlyRootFilesystem: false
  runAsNonRoot: false
  runAsUser: 0
  capabilities:
    add:
      - SYS_PTRACE
readinessProbe:
  initialDelaySeconds: 60
  periodSeconds: 5
  timeoutSeconds: 1
  failureThreshold: 120
startupProbe:
  initialDelaySeconds: 60
  periodSeconds: 5
  timeoutSeconds: 1
  failureThreshold: 120
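
This values file plugs into the same install flow as the README above; a sketch reusing the exported variables:

```console
cd GenAIInfra/helm-charts/common/tgi
helm dependency update
helm install tgi . --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values rocm-values.yaml
```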
6 changes: 6 additions & 0 deletions helm-charts/common/tgi/templates/configmap.yaml
@@ -64,3 +64,9 @@ data:
  {{- if .Values.BATCH_BUCKET_SIZE }}
  BATCH_BUCKET_SIZE: {{ .Values.BATCH_BUCKET_SIZE | quote }}
  {{- end }}
  {{- if .Values.HIP_VISIBLE_DEVICES }}
  HIP_VISIBLE_DEVICES: {{ .Values.HIP_VISIBLE_DEVICES | quote }}
  {{- end }}
  {{- if .Values.MAX_BATCH_SIZE }}
  MAX_BATCH_SIZE: {{ .Values.MAX_BATCH_SIZE | quote }}
  {{- end }}
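
Each guard emits its key only when the value is set, so the ROCm-specific variables stay out of CPU and Gaudi deployments. With rocm-values.yaml applied, the rendered ConfigMap data would contain roughly the following (a sketch, trimmed to the new keys):

```yaml
data:
  HIP_VISIBLE_DEVICES: "0"
  MAX_BATCH_SIZE: "4"
```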
8 changes: 5 additions & 3 deletions helm-charts/common/vllm/README.md
@@ -11,13 +11,15 @@ To install the chart, run the following:
Note that you cannot use `vllm` as the service release name due to an [environment variables conflict](https://docs.vllm.ai/en/stable/serving/env_vars.html#environment-variables).

```console
cd GenAIInfra/helm-charts/common
cd GenAIInfra/helm-charts/common/vllm
export MODELDIR=/mnt/opea-models
export MODELNAME="Intel/neural-chat-7b-v3-3"
export HFTOKEN="insert-your-huggingface-token-here"
helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
helm install myvllm . --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
# To deploy on Gaudi enabled kubernetes cluster
# helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml
# helm install myvllm . --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml
# To deploy on AMD ROCm GPU kubernetes cluster
# helm install vllm-rocm . --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values rocm-values.yaml
```

By default, the vllm service will download the "Intel/neural-chat-7b-v3-3" model.
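
Once the pod is ready, the OpenAI-compatible completions endpoint can be exercised directly. A minimal sketch, assuming the release name `myvllm` and a service on port 80 (both assumptions, check your chart):

```console
kubectl port-forward svc/myvllm 2080:80 &
curl http://localhost:2080/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}'
```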
31 changes: 31 additions & 0 deletions helm-charts/common/vllm/rocm-values.yaml
@@ -0,0 +1,31 @@
# Copyright (C) 2025 Advanced Micro Devices, Inc.

accelDevice: "rocm"
image:
  repository: opea/vllm-rocm
  tag: latest
env:
  HIP_VISIBLE_DEVICES: "0"
  TENSOR_PARALLEL_SIZE: "1"
  HF_HUB_DISABLE_PROGRESS_BARS: "1"
  HF_HUB_ENABLE_HF_TRANSFER: "0"
  VLLM_USE_TRITON_FLASH_ATTN: "0"
  VLLM_WORKER_MULTIPROC_METHOD: "spawn"
  PYTORCH_JIT: "0"
  HF_HOME: "/data"
extraCmd:
  command: ["python3", "/workspace/api_server.py"]
extraCmdArgs: ["--swap-space", "16",
               "--disable-log-requests",
               "--dtype", "float16",
               "--num-scheduler-steps", "1",
               "--distributed-executor-backend", "mp"]
resources:
  limits:
    amd.com/gpu: "1"
startupProbe:
  failureThreshold: 180
securityContext:
  readOnlyRootFilesystem: false
  runAsNonRoot: false
  runAsUser: 0
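
To scale beyond one GPU, the device visibility, tensor parallelism, and resource limit must move together. A sketch of a hypothetical override file, layered on top of rocm-values.yaml and not tested here:

```yaml
# two-gpu-values.yaml (hypothetical override)
env:
  HIP_VISIBLE_DEVICES: "0,1"
  TENSOR_PARALLEL_SIZE: "2"
resources:
  limits:
    amd.com/gpu: "2"
```

Apply it with an extra `--values two-gpu-values.yaml` after `--values rocm-values.yaml` so the later file takes precedence.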
5 changes: 5 additions & 0 deletions helm-charts/common/vllm/templates/configmap.yaml
@@ -42,3 +42,8 @@ data:
  {{- if .Values.PT_HPUGRAPH_DISABLE_TENSOR_CACHE }}
  PT_HPUGRAPH_DISABLE_TENSOR_CACHE: {{ .Values.PT_HPUGRAPH_DISABLE_TENSOR_CACHE | quote }}
  {{- end }}
  {{- if .Values.env }}
  {{- range $k, $v := .Values.env }}
  {{ tpl (toString $k) $ | trim }}: {{ tpl (toString $v) $ | trim | quote }}
  {{- end -}}
  {{- end }}
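
The generic `env` loop renders every key of the `env` map into the ConfigMap, quoting each value, so new backend-specific variables need no template changes. The `env` block from rocm-values.yaml above would render roughly as (a sketch, not captured from actual chart output):

```yaml
data:
  HIP_VISIBLE_DEVICES: "0"
  TENSOR_PARALLEL_SIZE: "1"
  HF_HUB_DISABLE_PROGRESS_BARS: "1"
  HF_HUB_ENABLE_HF_TRANSFER: "0"
  VLLM_USE_TRITON_FLASH_ATTN: "0"
  VLLM_WORKER_MULTIPROC_METHOD: "spawn"
  PYTORCH_JIT: "0"
  HF_HOME: "/data"
```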
6 changes: 6 additions & 0 deletions helm-charts/common/vllm/templates/deployment.yaml
@@ -85,6 +85,12 @@ spec:
          {{- if .Values.image.pullPolicy }}
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          {{- end }}
          {{- if .Values.extraCmd }}
          command:
            {{- range .Values.extraCmd.command }}
            - {{ . | quote }}
            {{- end }}
          {{- end }}
          args:
          {{- if .Values.extraCmdArgs }}
            {{- range .Values.extraCmdArgs }}
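
When `extraCmd` is set, the container entrypoint is replaced rather than just its arguments, which is what lets the ROCm image launch its own `/workspace/api_server.py` wrapper. With the values above, the rendered container spec would look roughly like this sketch:

```yaml
command:
  - "python3"
  - "/workspace/api_server.py"
args:
  - "--swap-space"
  - "16"
  - "--disable-log-requests"
  # remaining extraCmdArgs follow in order
```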
11 changes: 9 additions & 2 deletions helm-charts/docsum/README.md
@@ -9,17 +9,24 @@ DocSum depends on LLM microservice, refer to llm-uservice for more config detail
To install the chart, run the following:

```console
git clone https://github.com/opea-project/GenAIInfra.git
cd GenAIInfra/helm-charts/
mkdir /mnt/opea-models && chmod -R 775 /mnt/opea-models
./update_dependency.sh
helm dependency update docsum
export HFTOKEN="insert-your-huggingface-token-here"
export MODELDIR="/mnt/opea-models"
export MODELNAME="Intel/neural-chat-7b-v3-3"
helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME}
# To use Gaudi device with vLLM
# helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values docsum/gaudi-values.yaml ...
# helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values docsum/gaudi-values.yaml
# To use Gaudi device with TGI
# helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values docsum/gaudi-tgi-values.yaml ..
# helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values docsum/gaudi-tgi-values.yaml
# To use AMD ROCm device with vLLM
# helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values docsum/rocm-values.yaml
# To use AMD ROCm device with TGI
# helm install docsum docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values docsum/rocm-tgi-values.yaml

```

## Verify
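
Once all pods are ready, an end-to-end request exercises the whole pipeline. A sketch, assuming the release name `docsum` and a gateway service on port 8888 (both assumptions, adjust to your deployment):

```console
kubectl port-forward svc/docsum 8888:8888 &
curl http://localhost:8888/v1/docsum \
  -H 'Content-Type: multipart/form-data' \
  -F 'type=text' \
  -F 'messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models.'
```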
45 changes: 45 additions & 0 deletions helm-charts/docsum/rocm-tgi-values.yaml
@@ -0,0 +1,45 @@
# Copyright (C) 2025 Advanced Micro Devices, Inc.

tgi:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "2.4.1-rocm"
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  USE_FLASH_ATTENTION: "false"
  FLASH_ATTENTION_RECOMPUTE: "false"
  HIP_VISIBLE_DEVICES: "0"
  MAX_BATCH_SIZE: "4"
  extraCmdArgs: ["--num-shard", "1"]
  resources:
    limits:
      amd.com/gpu: "1"
    requests:
      cpu: 1
      memory: 16Gi
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add:
        - SYS_PTRACE
  readinessProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

llm-uservice:
  DOCSUM_BACKEND: "TGI"
  retryTimeoutSeconds: 720

vllm:
  enabled: false
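
After installing with this file (the README command above), only the TGI pod should exist and it should hold the GPU; a quick check, with label names assumed rather than verified:

```console
kubectl get pods -l app.kubernetes.io/name=tgi
kubectl get pods -l app.kubernetes.io/name=vllm    # expect "No resources found"
kubectl describe pod -l app.kubernetes.io/name=tgi | grep amd.com/gpu
```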
40 changes: 40 additions & 0 deletions helm-charts/docsum/rocm-values.yaml
@@ -0,0 +1,40 @@
# Copyright (C) 2025 Advanced Micro Devices, Inc.

tgi:
  enabled: false

llm-uservice:
  DOCSUM_BACKEND: "vLLM"
  retryTimeoutSeconds: 720

vllm:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: opea/vllm-rocm
    tag: latest
  env:
    HIP_VISIBLE_DEVICES: "0"
    TENSOR_PARALLEL_SIZE: "1"
    HF_HUB_DISABLE_PROGRESS_BARS: "1"
    HF_HUB_ENABLE_HF_TRANSFER: "0"
    VLLM_USE_TRITON_FLASH_ATTN: "0"
    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
    PYTORCH_JIT: "0"
    HF_HOME: "/data"
  extraCmd:
    command: ["python3", "/workspace/api_server.py"]
  extraCmdArgs: ["--swap-space", "16",
                 "--disable-log-requests",
                 "--dtype", "float16",
                 "--num-scheduler-steps", "1",
                 "--distributed-executor-backend", "mp"]
  resources:
    limits:
      amd.com/gpu: "1"
  startupProbe:
    failureThreshold: 180
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
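
Because these subchart values flow through the generic `env` loop shown in the vllm configmap above, the variables can be confirmed inside the running container; a sketch, with the deployment name `docsum-vllm` being an assumption:

```console
kubectl exec deploy/docsum-vllm -- env | grep -E 'HIP_VISIBLE_DEVICES|TENSOR_PARALLEL_SIZE|HF_HOME'
```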
1 change: 0 additions & 1 deletion helm-charts/update_dependency.sh
@@ -1,5 +1,4 @@
#!/bin/bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
