From ae3b274dd35b22f7f1c78189e5b060565dc64008 Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Thu, 30 May 2024 14:43:14 +0800 Subject: [PATCH 1/8] added the vllm gaudi build docker files Signed-off-by: tianyil1 --- .../llms/text-generation/vllm/build_docker.sh | 40 +++++++++++++++++++ .../text-generation/vllm/build_docker_cpu.sh | 9 ----- .../vllm/docker/Dockerfile.hpu | 16 ++++++++ 3 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 comps/llms/text-generation/vllm/build_docker.sh delete mode 100644 comps/llms/text-generation/vllm/build_docker_cpu.sh create mode 100644 comps/llms/text-generation/vllm/docker/Dockerfile.hpu diff --git a/comps/llms/text-generation/vllm/build_docker.sh b/comps/llms/text-generation/vllm/build_docker.sh new file mode 100644 index 000000000..0fed539d2 --- /dev/null +++ b/comps/llms/text-generation/vllm/build_docker.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Set default values +default_hw_mode="cpu" + +# Assign arguments to variable +hw_mode=${1:-$default_hw_mode} + +# Check if all required arguments are provided +if [ "$#" -lt 0 ] || [ "$#" -gt 1 ]; then + echo "Usage: $0 [hw_mode]" + echo "Please customize the arguments you want to use. + - hw_mode: The hardware mode for the Ray Gaudi endpoint, with the default being 'cpu', and the optional selection can be 'cpu' and 'hpu'." + exit 1 +fi + +# Build the docker image for vLLM based on the hardware mode +if [ "$hw_mode" = "cpu" ]; then + git clone https://github.com/vllm-project/vllm.git + cd ./vllm/ + docker build -f Dockerfile.cpu -t vllm:cpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy +elif [ "$hw_mode" = "hpu" ]; then + git clone https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork/ && git checkout v0.4.2-Gaudi-1.16.0 + cd ../docker && docker build -f Dockerfile.hpu -t vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + diff --git a/comps/llms/text-generation/vllm/build_docker_cpu.sh b/comps/llms/text-generation/vllm/build_docker_cpu.sh deleted file mode 100644 index 487c4221b..000000000 --- a/comps/llms/text-generation/vllm/build_docker_cpu.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -git clone https://github.com/vllm-project/vllm.git -cd ./vllm/ -docker build -f Dockerfile.cpu -t vllm:cpu --shm-size=128g . 
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu new file mode 100644 index 000000000..28919883a --- /dev/null +++ b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu @@ -0,0 +1,16 @@ +FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + +ENV LANG=en_US.UTF-8 + +COPY ../vllm-fork/ /root/vllm-fork + +WORKDIR /root/vllm-fork + +RUN pip install -e . + +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + service ssh restart + +ENV no_proxy=localhost,127.0.0.1 + +CMD ["/bin/bash"] \ No newline at end of file From 23f28fe31a82fc490e2d300fda3900839693f825 Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Fri, 31 May 2024 16:06:48 +0800 Subject: [PATCH 2/8] refine the docker and add the launch vllm script Signed-off-by: tianyil1 --- comps/llms/text-generation/vllm/build_docker.sh | 10 ++++------ .../text-generation/vllm/docker/Dockerfile.hpu | 10 +++++++--- .../text-generation/vllm/launch_vllm_service.sh | 17 +++++++++++++---- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/comps/llms/text-generation/vllm/build_docker.sh b/comps/llms/text-generation/vllm/build_docker.sh index 0fed539d2..b1d6fba14 100644 --- a/comps/llms/text-generation/vllm/build_docker.sh +++ b/comps/llms/text-generation/vllm/build_docker.sh @@ -29,12 +29,10 @@ if [ "$#" -lt 0 ] || [ "$#" -gt 1 ]; then fi # Build the docker image for vLLM based on the hardware mode -if [ "$hw_mode" = "cpu" ]; then +if [ "$hw_mode" = "hpu" ]; then + docker build -f docker/Dockerfile.hpu -t vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy +else git clone https://github.com/vllm-project/vllm.git cd ./vllm/ docker build -f Dockerfile.cpu -t vllm:cpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -elif [ "$hw_mode" = "hpu" ]; then - git clone https://github.com/HabanaAI/vllm-fork.git - cd vllm-fork/ && git checkout v0.4.2-Gaudi-1.16.0 - cd ../docker && docker build -f Dockerfile.hpu -t vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy - +fi \ No newline at end of file diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu index 28919883a..d006b4442 100644 --- a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu +++ b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu @@ -2,15 +2,19 @@ FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installe ENV LANG=en_US.UTF-8 -COPY ../vllm-fork/ /root/vllm-fork +WORKDIR /root -WORKDIR /root/vllm-fork +RUN pip install --upgrade-strategy eager optimum[habana] -RUN pip install -e . 
+RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@ae3d6121 RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ service ssh restart ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_LAZY_ACC_PAR_MODE=0 + +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + CMD ["/bin/bash"] \ No newline at end of file diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh index c6fc04210..18c2efd3e 100644 --- a/comps/llms/text-generation/vllm/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh @@ -6,20 +6,29 @@ # Set default values default_port=8080 +default_hw_mode="cpu" default_model="mistralai/Mistral-7B-v0.1" # Assign arguments to variables port_number=${1:-$default_port} model_name=${2:-$default_model} +hw_mode=${3:-$default_hw_mode} # Check if all required arguments are provided -if [ "$#" -lt 0 ] || [ "$#" -gt 2 ]; then - echo "Usage: $0 [port_number] [model_name]" +if [ "$#" -lt 0 ] || [ "$#" -gt 3 ]; then + echo "Usage: $0 [port_number] [model_name] [hw_mode]" + echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080." + echo "model_name: The model name utilized for LLM, with the default set to 'mistralai/Mistral-7B-v0.1'." + echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection is 'hpu'" exit 1 fi # Set the volume variable volume=$PWD/data -# Build the Docker run command based on the number of cards -docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number" +# Build the Docker run command based on hardware mode +if ["$hw_mode" = "hpu"]; then + docker run -it --runtime=habana --rm --name="ChatQnA_server" -p $port_number:$port_number -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number" +else + docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number" +fi From 5b3202b5c8c59a751ea39bb56871c331151d4d6e Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Fri, 31 May 2024 16:33:08 +0800 Subject: [PATCH 3/8] refine the vllm readme for gaudi support Signed-off-by: tianyil1 --- comps/llms/text-generation/vllm/README.md | 13 ++++++++----- .../text-generation/vllm/launch_vllm_service.sh | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md index af5343da3..1cbf59d4a 100644 --- a/comps/llms/text-generation/vllm/README.md +++ b/comps/llms/text-generation/vllm/README.md @@ -1,10 +1,10 @@ # vLLM Endpoint Serve 
-[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving, it delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention, Continuous batching and etc.. Besides GPUs, vLLM already supported [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html), Gaudi accelerators support will be added soon. This guide provides an example on how to launch vLLM serving endpoint on CPU. +[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving, it delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention, Continuous batching and etc.. Besides GPUs, vLLM already supported [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). This guide provides an example on how to launch vLLM serving endpoint on CPU and Gaudi accelerators. ## Getting Started -### Launch vLLM CPU Service +### Launch vLLM Service #### Launch a local server instance: @@ -12,6 +12,8 @@ bash ./serving/vllm/launch_vllm_service.sh ``` +The `./serving/vllm/launch_vllm_service.sh` accepts one parameter `hw_mode` to specify the hardware mode of the service, with the default being `cpu`, and the optional selection can be `hpu`. + For gated models such as `LLAMA-2`, you will have to pass -e HF_TOKEN=\ to the docker run command above with a valid Hugging Face Hub read token. Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get the access token and export `HF_TOKEN` environment with the token. @@ -33,14 +35,15 @@ curl http://127.0.0.1:8080/v1/completions \ }' ``` -#### Customize vLLM CPU Service +#### Customize vLLM Service -The `./serving/vllm/launch_vllm_service.sh` script accepts two parameters: +The `./serving/vllm/launch_vllm_service.sh` script accepts three parameters: - port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080. - model_name: The model name utilized for LLM, with the default set to "mistralai/Mistral-7B-v0.1". +- hw_mode: The hardware mode utilized for LLM, with the default set to "cpu", and the optional selection can be "hpu" -You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM CPU endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`: +You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`: ```bash export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8080" diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh index 18c2efd3e..880dddb8b 100644 --- a/comps/llms/text-generation/vllm/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh @@ -19,7 +19,7 @@ if [ "$#" -lt 0 ] || [ "$#" -gt 3 ]; then echo "Usage: $0 [port_number] [model_name] [hw_mode]" echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080." echo "model_name: The model name utilized for LLM, with the default set to 'mistralai/Mistral-7B-v0.1'." 
- echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection is 'hpu'" + echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'" exit 1 fi From 2a7c7298d008bba9e17f1349f69872baa063fe33 Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Thu, 6 Jun 2024 13:48:45 +0800 Subject: [PATCH 4/8] updated the gaudi version with the official version Signed-off-by: tianyil1 --- comps/llms/text-generation/vllm/README.md | 2 +- comps/llms/text-generation/vllm/docker/Dockerfile.hpu | 2 +- comps/llms/text-generation/vllm/launch_vllm_service.sh | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md index 1cbf59d4a..18553a0f9 100644 --- a/comps/llms/text-generation/vllm/README.md +++ b/comps/llms/text-generation/vllm/README.md @@ -40,7 +40,7 @@ curl http://127.0.0.1:8080/v1/completions \ The `./serving/vllm/launch_vllm_service.sh` script accepts three parameters: - port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080. -- model_name: The model name utilized for LLM, with the default set to "mistralai/Mistral-7B-v0.1". +- model_name: The model name utilized for LLM, with the default set to "Intel/neural-chat-7b-v3-3". - hw_mode: The hardware mode utilized for LLM, with the default set to "cpu", and the optional selection can be "hpu" You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`: diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu index d006b4442..430cf4641 100644 --- a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu +++ b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest +FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest ENV LANG=en_US.UTF-8 diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh index 880dddb8b..08b2895eb 100644 --- a/comps/llms/text-generation/vllm/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh @@ -7,7 +7,7 @@ # Set default values default_port=8080 default_hw_mode="cpu" -default_model="mistralai/Mistral-7B-v0.1" +default_model="Intel/neural-chat-7b-v3-3" # Assign arguments to variables port_number=${1:-$default_port} @@ -18,7 +18,7 @@ hw_mode=${3:-$default_hw_mode} if [ "$#" -lt 0 ] || [ "$#" -gt 3 ]; then echo "Usage: $0 [port_number] [model_name] [hw_mode]" echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080." - echo "model_name: The model name utilized for LLM, with the default set to 'mistralai/Mistral-7B-v0.1'." + echo "model_name: The model name utilized for LLM, with the default set to 'Intel/neural-chat-7b-v3-3'." 
echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'" exit 1 fi @@ -28,7 +28,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if ["$hw_mode" = "hpu"]; then - docker run -it --runtime=habana --rm --name="ChatQnA_server" -p $port_number:$port_number -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number" + docker run -it --runtime=habana --rm --name="ChatQnA_server" -p $port_number:$port_number -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number" else docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number" fi From a331e4e39ec3d61dfb1bfc8d27d3b3af99d9da13 Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Thu, 6 Jun 2024 16:05:37 +0800 Subject: [PATCH 5/8] add the host ip to the launch script Signed-off-by: tianyil1 --- comps/llms/text-generation/vllm/launch_vllm_service.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh index 08b2895eb..7e32c8775 100644 --- a/comps/llms/text-generation/vllm/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh @@ -27,8 +27,8 @@ fi volume=$PWD/data # Build the Docker run command based on hardware mode -if ["$hw_mode" = "hpu"]; then +if [ "$hw_mode" = "hpu" ]; then docker run -it --runtime=habana --rm --name="ChatQnA_server" -p $port_number:$port_number -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number" else - docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number" + docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number" fi From 
f08c917b614e0a469a1594d0178c8ac617fab64a Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Thu, 6 Jun 2024 16:10:36 +0800 Subject: [PATCH 6/8] add the vllm gaudi into the main readme Signed-off-by: tianyil1 --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a77afa704..51090259a 100644 --- a/README.md +++ b/README.md @@ -134,8 +134,8 @@ The initially supported `Microservices` are described in the below table. More ` Dataprep on Xeon CPU - LLM - LangChain + LLM + LangChain Intel/neural-chat-7b-v3-3 TGI Gaudi Gaudi2 @@ -147,7 +147,7 @@ The initially supported `Microservices` are described in the below table. More ` LLM on Xeon CPU - meta-llama/Llama-2-7b-chat-hf + Intel/neural-chat-7b-v3-3 Ray Serve Gaudi2 LLM on Gaudi2 @@ -157,8 +157,12 @@ The initially supported `Microservices` are described in the below table. More ` LLM on Xeon CPU - mistralai/Mistral-7B-v0.1 - vLLM + Intel/neural-chat-7b-v3-3 + vLLM + Gaudi2 + LLM on Gaudi2 + + Xeon LLM on Xeon CPU From ef3eb8f633b802c698d7106fa752e31b601fc98c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 08:12:51 +0000 Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/text-generation/vllm/build_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/text-generation/vllm/build_docker.sh b/comps/llms/text-generation/vllm/build_docker.sh index b1d6fba14..3680f076c 100644 --- a/comps/llms/text-generation/vllm/build_docker.sh +++ b/comps/llms/text-generation/vllm/build_docker.sh @@ -35,4 +35,4 @@ else git clone https://github.com/vllm-project/vllm.git cd ./vllm/ docker build -f Dockerfile.cpu -t vllm:cpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -fi \ No newline at end of file +fi From 6df39120042e821bb14fb5dc1702a0a15adadfae Mon Sep 17 00:00:00 2001 From: tianyil1 Date: Tue, 11 Jun 2024 14:42:38 +0800 Subject: [PATCH 8/8] refine the readme with the default model 'Intel/neural-chat-7b-v3-3' Signed-off-by: tianyil1 --- comps/llms/text-generation/vllm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md index 18553a0f9..338631552 100644 --- a/comps/llms/text-generation/vllm/README.md +++ b/comps/llms/text-generation/vllm/README.md @@ -47,5 +47,5 @@ You have the flexibility to customize two parameters according to your specific ```bash export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8080" -export LLM_MODEL= # example: export LLM_MODEL="mistralai/Mistral-7B-v0.1" +export LLM_MODEL= # example: export LLM_MODEL="Intel/neural-chat-7b-v3-3" ```
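
---

Below is a minimal usage sketch of the scripts this series adds, assuming the tree state after PATCH 8/8 (`build_docker.sh` and `launch_vllm_service.sh` under `comps/llms/text-generation/vllm/`). It is not part of the patches themselves; the port, model name, and token value are placeholders chosen for illustration, not values mandated by the series.

```bash
# Sketch only: build the Gaudi (HPU) image and launch the OpenAI-compatible endpoint.
# Assumes a Gaudi host with the habana container runtime and a valid Hugging Face read token.
cd comps/llms/text-generation/vllm

# Build the vLLM image for the chosen hardware mode ("cpu" is the default, "hpu" for Gaudi).
bash ./build_docker.sh hpu

# Token read by launch_vllm_service.sh (PATCH 5 switches the script to HF_TOKEN).
export HF_TOKEN=<your Hugging Face read token>

# Launch: [port_number] [model_name] [hw_mode]
bash ./launch_vllm_service.sh 8080 Intel/neural-chat-7b-v3-3 hpu

# Query the completions endpoint once the server is up (mirrors the curl example in the README diff).
curl http://127.0.0.1:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is deep learning?", "max_tokens": 32, "temperature": 0}'
```

Note the design choice visible across PATCH 1 and PATCH 2: the HPU path stops cloning and COPYing `vllm-fork` at build-script time and instead installs the fork at a pinned commit directly inside `Dockerfile.hpu`, so `build_docker.sh` only needs to select the Dockerfile for the requested hardware mode.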