diff --git a/DocSum/kubernetes/helm/README.md b/DocSum/kubernetes/helm/README.md
index aa4bf07205..64537cc06f 100644
--- a/DocSum/kubernetes/helm/README.md
+++ b/DocSum/kubernetes/helm/README.md
@@ -16,3 +16,150 @@ helm install docsum oci://ghcr.io/opea-project/charts/docsum --set global.HUGGI
 export HFTOKEN="insert-your-huggingface-token-here"
 helm install docsum oci://ghcr.io/opea-project/charts/docsum --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
 ```
+
+## Deploy on AMD ROCm using Helm charts from the binary Helm repository
+
+### Creating working dirs
+
+```bash
+mkdir ~/docsum-k8s-install && cd ~/docsum-k8s-install
+```
+
+### Cloning repos
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+```
+
+### Go to the installation directory
+
+```bash
+cd GenAIExamples/DocSum/kubernetes/helm
+```
+
+### Setting system variables
+
+```bash
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+export MODELNAME="Intel/neural-chat-7b-v3-3"
+```
+
+### Setting variables in Values files
+
+#### If ROCm vLLM is used
+
+```bash
+nano ~/docsum-k8s-install/GenAIExamples/DocSum/kubernetes/helm/rocm-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; give a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- TENSOR_PARALLEL_SIZE - must match the number of GPUs used (see the sketch after this list)
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
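+
+For example, a hypothetical four-GPU vLLM configuration (the GPU IDs and counts below are illustrative, not shipped defaults) would keep all three settings in sync:
+
+```yaml
+# Illustrative multi-GPU override for rocm-values.yaml (hypothetical values)
+vllm:
+  env:
+    HIP_VISIBLE_DEVICES: "0,1,2,3" # four GPU IDs
+    TENSOR_PARALLEL_SIZE: "4" # must equal the number of GPUs
+  resources:
+    limits:
+      amd.com/gpu: "4" # must equal the number of GPUs
+```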
+
+#### If ROCm TGI is used
+
+```bash
+nano ~/docsum-k8s-install/GenAIExamples/DocSum/kubernetes/helm/rocm-tgi-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; give a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used (see the sketch after this list)
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
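+
+As with vLLM, a hypothetical four-GPU TGI configuration (illustrative values) would keep the shard count and GPU limit in sync:
+
+```yaml
+# Illustrative multi-GPU override for rocm-tgi-values.yaml (hypothetical values)
+tgi:
+  HIP_VISIBLE_DEVICES: "0,1,2,3" # four GPU IDs
+  extraCmdArgs: [ "--num-shard","4" ] # one shard per GPU
+  resources:
+    limits:
+      amd.com/gpu: "4" # must equal the number of GPUs
+```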
+
+### Installing the Helm Chart
+
+#### If ROCm vLLM is used
+
+```bash
+helm upgrade --install docsum oci://ghcr.io/opea-project/charts/docsum \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values rocm-values.yaml
+```
+
+#### If ROCm TGI is used
+
+```bash
+helm upgrade --install docsum oci://ghcr.io/opea-project/charts/docsum \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values rocm-tgi-values.yaml
+```
+
+## Deploy on AMD ROCm using Helm charts from Git repositories
+
+### Creating working dirs
+
+```bash
+mkdir ~/docsum-k8s-install && cd ~/docsum-k8s-install
+```
+
+### Cloning repos
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+git clone https://github.com/opea-project/GenAIInfra.git
+```
+
+### Go to the installation directory
+
+```bash
+cd GenAIExamples/DocSum/kubernetes/helm
+```
+
+### Setting system variables
+
+```bash
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+export MODELNAME="Intel/neural-chat-7b-v3-3"
+```
+
+### Setting variables in Values files
+
+#### If ROCm vLLM is used
+
+```bash
+nano ~/docsum-k8s-install/GenAIExamples/DocSum/kubernetes/helm/rocm-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; give a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+#### If ROCm TGI is used
+
+```bash
+nano ~/docsum-k8s-install/GenAIExamples/DocSum/kubernetes/helm/rocm-tgi-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; give a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+### Installing the Helm Chart
+
+#### If ROCm vLLM is used
+
+```bash
+cd ~/docsum-k8s-install/GenAIInfra/helm-charts
+./update_dependency.sh
+helm dependency update docsum
+helm upgrade --install docsum docsum \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values ../../GenAIExamples/DocSum/kubernetes/helm/rocm-values.yaml
+```
+
+#### If ROCm TGI is used
+
+```bash
+cd ~/docsum-k8s-install/GenAIInfra/helm-charts
+./update_dependency.sh
+helm dependency update docsum
+helm upgrade --install docsum docsum \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values ../../GenAIExamples/DocSum/kubernetes/helm/rocm-tgi-values.yaml
+```
diff --git a/DocSum/kubernetes/helm/rocm-tgi-values.yaml b/DocSum/kubernetes/helm/rocm-tgi-values.yaml
new file mode 100644
index 0000000000..d3b5e49722
--- /dev/null
+++ b/DocSum/kubernetes/helm/rocm-tgi-values.yaml
@@ -0,0 +1,45 @@
+# Copyright (C) 2025 Advanced Micro Devices, Inc.
+
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "2.4.1-rocm"
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  USE_FLASH_ATTENTION: "false"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  HIP_VISIBLE_DEVICES: "0"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","1" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+llm-uservice:
+  DOCSUM_BACKEND: "TGI"
+  retryTimeoutSeconds: 720
+
+vllm:
+  enabled: false
diff --git a/DocSum/kubernetes/helm/rocm-values.yaml b/DocSum/kubernetes/helm/rocm-values.yaml
new file mode 100644
index 0000000000..7236f50bd7
--- /dev/null
+++ b/DocSum/kubernetes/helm/rocm-values.yaml
@@ -0,0 +1,40 @@
+# Copyright (C) 2025 Advanced Micro Devices, Inc.
+
+tgi:
+  enabled: false
+
+llm-uservice:
+  DOCSUM_BACKEND: "vLLM"
+  retryTimeoutSeconds: 720
+
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    HIP_VISIBLE_DEVICES: "0"
+    TENSOR_PARALLEL_SIZE: "1"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0