From 937a90834e0ce61786576a4c449e95be4404baa6 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Tue, 31 Dec 2024 14:55:01 +0800 Subject: [PATCH 01/23] refactor docsum Signed-off-by: Xinyao Wang --- .../docker_compose/doc-summarization_tgi.yaml | 51 +++ .../doc-summarization_tgi_on_intel_hpu.yaml | 62 ++++ .../doc-summarization_vllm.yaml | 54 +++ .../doc-summarization_vllm_on_intel_hpu.yaml} | 31 +- .../doc-summarization}/Dockerfile | 4 +- .../doc-summarization}/README.md | 105 +++--- .../doc-summarization}/entrypoint.sh | 2 +- .../integrations}/__init__.py | 0 .../doc-summarization/integrations/opea.py | 314 ++++++++++++++++++ .../integrations/template.py | 55 +++ .../opea_docsum_microservice.py | 84 +++++ .../requirements-runtime.txt | 0 .../doc-summarization}/requirements.txt | 0 .../tgi/langchain/docker_compose_llm.yaml | 37 --- .../summarization/tgi/langchain/entrypoint.sh | 8 - comps/llms/summarization/tgi/langchain/llm.py | 245 -------------- .../summarization/vllm/langchain/Dockerfile | 28 -- .../summarization/vllm/langchain/README.md | 171 ---------- .../summarization/vllm/langchain/__init__.py | 2 - .../llms/summarization/vllm/langchain/llm.py | 247 -------------- .../vllm/langchain/requirements-runtime.txt | 1 - .../vllm/langchain/requirements.txt | 16 - 22 files changed, 701 insertions(+), 816 deletions(-) create mode 100644 comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml create mode 100644 comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml create mode 100644 comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml rename comps/llms/{summarization/vllm/langchain/docker_compose_llm.yaml => deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml} (53%) rename comps/llms/{summarization/tgi/langchain => src/doc-summarization}/Dockerfile (80%) rename comps/llms/{summarization/tgi/langchain => src/doc-summarization}/README.md (71%) rename comps/llms/{summarization/vllm/langchain => src/doc-summarization}/entrypoint.sh (81%) rename comps/llms/{summarization/tgi/langchain => src/doc-summarization/integrations}/__init__.py (100%) create mode 100644 comps/llms/src/doc-summarization/integrations/opea.py create mode 100644 comps/llms/src/doc-summarization/integrations/template.py create mode 100644 comps/llms/src/doc-summarization/opea_docsum_microservice.py rename comps/llms/{summarization/tgi/langchain => src/doc-summarization}/requirements-runtime.txt (100%) rename comps/llms/{summarization/tgi/langchain => src/doc-summarization}/requirements.txt (100%) delete mode 100644 comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml delete mode 100644 comps/llms/summarization/tgi/langchain/entrypoint.sh delete mode 100644 comps/llms/summarization/tgi/langchain/llm.py delete mode 100644 comps/llms/summarization/vllm/langchain/Dockerfile delete mode 100644 comps/llms/summarization/vllm/langchain/README.md delete mode 100644 comps/llms/summarization/vllm/langchain/__init__.py delete mode 100644 comps/llms/summarization/vllm/langchain/llm.py delete mode 100644 comps/llms/summarization/vllm/langchain/requirements-runtime.txt delete mode 100644 comps/llms/summarization/vllm/langchain/requirements.txt diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml new file mode 100644 index 0000000000..af1caf6fb0 --- /dev/null +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi-service: + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + container_name: tgi-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} + llm: + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - ${DOCSUM_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_BACKEND: ${LLM_BACKEND} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml new file mode 100644 index 0000000000..7afe3fda82 --- /dev/null +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml @@ -0,0 +1,62 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + container_name: tgi-gaudi-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + runtime: habana + cap_add: + - SYS_NICE + ipc: host + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} + llm: + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - ${DOCSUM_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_BACKEND: ${LLM_BACKEND} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml new file mode 100644 index 0000000000..9671fff53b --- /dev/null +++ 
b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml @@ -0,0 +1,54 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + vllm-service: + image: opea/vllm:latest + container_name: vllm-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + shm_size: 128g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false} + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 + llm: + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + vllm-service: + condition: service_healthy + ports: + - ${DOCSUM_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_BACKEND: ${LLM_BACKEND} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml similarity index 53% rename from comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml rename to comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml index 26847387cc..4f4d836b84 100644 --- a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml @@ -8,37 +8,50 @@ services: image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - - "8008:80" + - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: - "./data:/data" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HF_TOKEN} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false} runtime: habana cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 llm: - image: opea/llm-docsum-vllm:latest - container_name: llm-docsum-vllm-server + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + vllm-service: + condition: service_healthy ports: - - "9000:9000" + - ${DOCSUM_PORT:-9000}:9000 ipc: host environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - vLLM_ENDPOINT: ${vLLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} - LLM_MODEL_ID: ${LLM_MODEL_ID} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} 
       MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
+      LLM_ENDPOINT: ${LLM_ENDPOINT}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      LLM_BACKEND: ${LLM_BACKEND}
     restart: unless-stopped
 
 networks:
diff --git a/comps/llms/summarization/tgi/langchain/Dockerfile b/comps/llms/src/doc-summarization/Dockerfile
similarity index 80%
rename from comps/llms/summarization/tgi/langchain/Dockerfile
rename to comps/llms/src/doc-summarization/Dockerfile
index 3a73120547..a7c07df449 100644
--- a/comps/llms/summarization/tgi/langchain/Dockerfile
+++ b/comps/llms/src/doc-summarization/Dockerfile
@@ -19,10 +19,10 @@ COPY comps /home/user/comps
 
 RUN pip install --no-cache-dir --upgrade pip setuptools && \
     if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
-    pip install --no-cache-dir -r /home/user/comps/llms/summarization/tgi/langchain/requirements.txt
+    pip install --no-cache-dir -r /home/user/comps/llms/src/doc-summarization/requirements.txt
 
 ENV PYTHONPATH=$PYTHONPATH:/home/user
 
-WORKDIR /home/user/comps/llms/summarization/tgi/langchain
+WORKDIR /home/user/comps/llms/src/doc-summarization
 
 ENTRYPOINT ["bash", "entrypoint.sh"]
diff --git a/comps/llms/summarization/tgi/langchain/README.md b/comps/llms/src/doc-summarization/README.md
similarity index 71%
rename from comps/llms/summarization/tgi/langchain/README.md
rename to comps/llms/src/doc-summarization/README.md
index 888b4adce1..4dd8b664e6 100644
--- a/comps/llms/summarization/tgi/langchain/README.md
+++ b/comps/llms/src/doc-summarization/README.md
@@ -1,65 +1,43 @@
-# Document Summary TGI Microservice
+# Document Summary LLM Microservice
 
-This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
-[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
+This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference on Intel Xeon and Gaudi2 processors. The backend LLM service can be either [TGI](../../../3rd_parties/tgi) or [vLLM](../../../3rd_parties/vllm).
 
-## 🚀1. Start Microservice with Python 🐍 (Option 1)
+## 🚀1. Start Microservice with Docker 🐳
 
-To start the LLM microservice, you need to install python packages first.
+### 1.1 Setup Environment Variables
 
-### 1.1 Install Requirements
+In order to start the DocSum service, you need to set up the following environment variables first.
 
 ```bash
-pip install -r requirements.txt
-```
-
-### 1.2 Start LLM Service
-
-```bash
-export HF_TOKEN=${your_hf_api_token}
-docker run -p 8008:80 -v ./data:/data --name llm-docsum-tgi --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id ${your_hf_llm_model}
-```
-
-### 1.3 Verify the TGI Service
-
-```bash
-curl http://${your_ip}:8008/v1/chat/completions \
-  -X POST \
-  -d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-  -H 'Content-Type: application/json'
+export host_ip=${your_host_ip}
+export LLM_ENDPOINT_PORT=8008
+export DOCSUM_PORT=9000
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export LLM_MODEL_ID=${your_hf_llm_model}
+export MAX_INPUT_TOKENS=2048
+export MAX_TOTAL_TOKENS=4096
+export LLM_BACKEND="tgi" # or "vllm"
 ```
 
-### 1.4 Start LLM Service with Python Script
+Please make sure MAX_TOTAL_TOKENS is larger than (MAX_INPUT_TOKENS + max_new_tokens + 50); 50 tokens are reserved for the prompt.
 
-```bash
-export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
-python llm.py
-```
+### 1.2 Build Docker Image
 
-## 🚀2. Start Microservice with Docker 🐳 (Option 2)
+Step 1: Prepare the backend LLM docker image.
 
-If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service with docker.
+If you want to use the vLLM backend, refer to [vLLM](../../../3rd_parties/vllm/src) to build the vLLM docker image first.
 
-### 2.1 Setup Environment Variables
+No extra image build is needed for the TGI backend.
 
-In order to start TGI and LLM services, you need to setup the following environment variables first.
+Step 2: Build the DocSum docker image.
 
 ```bash
-export HF_TOKEN=${your_hf_api_token}
-export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
-export LLM_MODEL_ID=${your_hf_llm_model}
-export MAX_INPUT_TOKENS=2048
-export MAX_TOTAL_TOKENS=4096
+cd ../../../../
+docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
 ```
 
-Please make sure MAX_TOTAL_TOKENS should be larger than (MAX_INPUT_TOKENS + max_new_tokens + 50), 50 is reserved prompt length.
-
-### 2.2 Build Docker Image
-
-```bash
-cd ../../../../../
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
-```
+### 1.3 Run Docker
 
 To start a docker container, you have two options:
 
@@ -68,16 +46,45 @@ To start a docker container, you have two options:
 
 - A. Run Docker with CLI
 - B. Run Docker with Docker Compose
 
 You can choose one as needed.
 
-### 2.3 Run Docker with CLI (Option A)
+### 1.3.1 Run Docker with CLI (Option A)
+
+Step 1: Start the backend LLM service.
+Please refer to the [TGI](../../../3rd_parties/tgi/deployment/docker_compose/) or [vLLM](../../../3rd_parties/vllm/deployment/docker_compose/) guidelines to start a backend LLM service.
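+
+For example, a TGI backend on an Intel Xeon host could be brought up with a command along the following lines. This is a minimal sketch mirroring the TGI compose file in this patch; adjust the image tag, model, and token limits to your environment:
+
+```bash
+docker run -d --name tgi-server --shm-size 1g \
+  -p ${LLM_ENDPOINT_PORT:-8008}:80 \
+  -v ./data:/data \
+  -e HF_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
+  ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu \
+  --model-id $LLM_MODEL_ID --max-input-length $MAX_INPUT_TOKENS --max-total-tokens $MAX_TOTAL_TOKENS
+```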
+ +Step 2: Start the DocSum microservices ```bash -docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} opea/llm-docsum-tgi:latest +docker run -d \ + --name="llm-docsum-server" \ + -p 9000:9000 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e LLM_MODEL_ID=$LLM_MODEL_ID \ + -e LLM_ENDPOINT=$LLM_ENDPOINT \ + -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \ + -e LLM_BACKEND=$LLM_BACKEND \ + -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} \ + -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} \ + opea/llm-docsum:latest ``` -### 2.4 Run Docker with Docker Compose (Option B) +### 1.3.2 Run Docker with Docker Compose (Option B) ```bash -docker compose -f docker_compose_llm.yaml up -d +cd ../../deployment/docker_compose/ + +# Backend is TGI on xeon +docker compose -f doc-summarization_tgi.yaml up -d + +# Backend is TGI on gaudi +# docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d + +# Backend is vLLM on xeon +# docker compose -f doc-summarization_vllm.yaml up -d + +# Backend is vLLM on gaudi +# docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d ``` ## 🚀3. Consume LLM Service diff --git a/comps/llms/summarization/vllm/langchain/entrypoint.sh b/comps/llms/src/doc-summarization/entrypoint.sh similarity index 81% rename from comps/llms/summarization/vllm/langchain/entrypoint.sh rename to comps/llms/src/doc-summarization/entrypoint.sh index d60eddd36b..64c8df3b4d 100644 --- a/comps/llms/summarization/vllm/langchain/entrypoint.sh +++ b/comps/llms/src/doc-summarization/entrypoint.sh @@ -5,4 +5,4 @@ pip --no-cache-dir install -r requirements-runtime.txt -python llm.py +python opea_docsum_microservice.py diff --git a/comps/llms/summarization/tgi/langchain/__init__.py b/comps/llms/src/doc-summarization/integrations/__init__.py similarity index 100% rename from comps/llms/summarization/tgi/langchain/__init__.py rename to comps/llms/src/doc-summarization/integrations/__init__.py diff --git a/comps/llms/src/doc-summarization/integrations/opea.py b/comps/llms/src/doc-summarization/integrations/opea.py new file mode 100644 index 0000000000..08beb619d1 --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/opea.py @@ -0,0 +1,314 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import requests +from fastapi.responses import StreamingResponse +from langchain.chains.summarize import load_summarize_chain +from langchain.docstore.document import Document +from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter +from langchain_community.llms import HuggingFaceEndpoint, VLLMOpenAI +from langchain_core.prompts import PromptTemplate +from transformers import AutoTokenizer +from .template import templ_en, templ_zh, templ_refine_en, templ_refine_zh + +from comps import CustomLogger, GeneratedDoc, DocSumLLMParams, OpeaComponent, ServiceType +from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs + +logger = CustomLogger("llm_docsum") +logflag = os.getenv("LOGFLAG", False) + +# Environment variables +MODEL_NAME = os.getenv("LLM_MODEL_ID") +MODEL_CONFIGS = os.getenv("MODEL_CONFIGS") +TOKEN_URL = os.getenv("TOKEN_URL") +CLIENTID = os.getenv("CLIENTID") +CLIENT_SECRET = os.getenv("CLIENT_SECRET") +MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 
2048)) +MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096)) + +if os.getenv("LLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT") +elif os.getenv("TGI_LLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT") +elif os.getenv("vLLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT") +else: + DEFAULT_ENDPOINT = "http://localhost:8080" + +# Validate and Load the models config if MODEL_CONFIGS is not null +configs_map = {} +if MODEL_CONFIGS: + try: + configs_map = load_model_configs(MODEL_CONFIGS) + except ConfigError as e: + logger.error(f"Failed to load model configurations: {e}") + raise ConfigError(f"Failed to load model configurations: {e}") + + +def get_llm_endpoint(): + if not MODEL_CONFIGS: + return DEFAULT_ENDPOINT + try: + return configs_map.get(MODEL_NAME).get("endpoint") + except ConfigError as e: + logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}") + raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs") + +class OPEADocSum(OpeaComponent): + """A specialized OPEA DocSum component derived from OpeaComponent. + + Attributes: + client (TGI/vLLM): An instance of the TGI/vLLM client for text generation. + """ + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.LLM.name.lower(), description, config) + self.access_token = ( + get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None + ) + self.llm_endpoint = get_llm_endpoint() + self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + + async def generate(self, input: DocSumLLMParams, client): + """Invokes the TGI/vLLM LLM service to generate summarization for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). + client: TGI/vLLM based client + """ + ### check summary type + summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"] + if input.summary_type not in summary_types: + raise NotImplementedError(f"Please specify the summary_type in {summary_types}") + if input.summary_type == "auto": ### Check input token length in auto mode + token_len = len(self.tokenizer.encode(input.query)) + if token_len > MAX_INPUT_TOKENS + 50: + input.summary_type = "refine" + if logflag: + logger.info( + f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode." + ) + else: + input.summary_type = "stuff" + if logflag: + logger.info( + f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode." 
+ ) + + ### Check input language + if input.language in ["en", "auto"]: + templ = templ_en + templ_refine = templ_refine_en + elif input.language in ["zh"]: + templ = templ_zh + templ_refine = templ_refine_zh + else: + raise NotImplementedError('Please specify the input language in "en", "zh", "auto"') + + ## Prompt + PROMPT = PromptTemplate.from_template(templ) + if input.summary_type == "refine": + PROMPT_REFINE = PromptTemplate.from_template(templ_refine) + if logflag: + logger.info("After prompting:") + logger.info(PROMPT) + if input.summary_type == "refine": + logger.info(PROMPT_REFINE) + + ## Split text + if input.summary_type == "stuff": + text_splitter = CharacterTextSplitter() + else: + if input.summary_type == "refine": + if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: ## 128 is reserved prompt lenght + raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") + max_input_tokens = min( + MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS + ) + else: + if MAX_TOTAL_TOKENS <= input.max_tokens + 50: # 50 is reserved token length for prompt + raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") + max_input_tokens = min( + MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS + ) + chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens + chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) + text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( + tokenizer=self.tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + if logflag: + logger.info(f"set chunk size to: {chunk_size}") + logger.info(f"set chunk overlap to: {chunk_overlap}") + + texts = text_splitter.split_text(input.query) + docs = [Document(page_content=t) for t in texts] + if logflag: + logger.info(f"Split input query into {len(docs)} chunks") + logger.info(f"The character length of the first chunk is {len(texts[0])}") + + ## LLM chain + summary_type = input.summary_type + if summary_type == "stuff": + llm_chain = load_summarize_chain(llm=client, prompt=PROMPT) + elif summary_type == "truncate": + docs = [docs[0]] + llm_chain = load_summarize_chain(llm=client, prompt=PROMPT) + elif summary_type == "map_reduce": + llm_chain = load_summarize_chain( + llm=client, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True + ) + elif summary_type == "refine": + llm_chain = load_summarize_chain( + llm=client, + question_prompt=PROMPT, + refine_prompt=PROMPT_REFINE, + chain_type="refine", + return_intermediate_steps=True, + ) + else: + raise NotImplementedError(f'Please specify the summary_type in {summary_types}') + + if input.streaming: + + async def stream_generator(): + from langserve.serialization import WellKnownLCSerializer + + _serializer = WellKnownLCSerializer() + async for chunk in llm_chain.astream_log(docs): + data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") + if logflag: + logger.info(data) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + response = await llm_chain.ainvoke(docs) + + if input.summary_type in ["map_reduce", "refine"]: + intermediate_steps = response["intermediate_steps"] + if logflag: + logger.info("intermediate_steps:") + logger.info(intermediate_steps) + + output_text = response["output_text"] + if logflag: + logger.info("\n\noutput_text:") + 
logger.info(output_text) + + return GeneratedDoc(text=output_text, prompt=input.query) + +class OPEADocSum_TGI(OPEADocSum): + """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. + + Attributes: + client (TGI): An instance of the TGI client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the TGI LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + # response = requests.get(f"{self.llm_endpoint}/health") + + # Will remove after TGI gaudi fix health bug + url = f"{self.llm_endpoint}/generate" + data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}} + headers = {"Content-Type": "application/json"} + response = requests.post(url=url, json=data, headers=headers) + + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: DocSumLLMParams): + """Invokes the TGI LLM service to generate summarization output for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). + """ + server_kwargs = {} + if self.access_token: + server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"} + + if input.streaming and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support streaming=True, set to streaming=False") + input.streaming = False + self.client = HuggingFaceEndpoint( + endpoint_url=self.llm_endpoint, + max_new_tokens=input.max_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.streaming, + server_kwargs=server_kwargs, + ) + result = await self.generate(input, self.client) + + return result + + +class OPEADocSum_vLLM(OPEADocSum): + """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. + + Attributes: + client (vLLM): An instance of the vLLM client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the vLLM LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + response = requests.get(f"{self.llm_endpoint}/health") + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: DocSumLLMParams): + """Invokes the vLLM LLM service to generate summarization output for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). 
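+
+        Returns:
+            GeneratedDoc with the summary when streaming is disabled, otherwise a StreamingResponse.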
+ """ + headers = {} + if self.access_token: + headers = {"Authorization": f"Bearer {self.access_token}"} + + if input.streaming and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support streaming=True, set to streaming=False") + input.streaming = False + self.client = VLLMOpenAI( + openai_api_key="EMPTY", + openai_api_base=self.llm_endpoint + "/v1", + model_name=MODEL_NAME, + default_headers=headers, + max_tokens=input.max_tokens, + top_p=input.top_p, + streaming=input.streaming, + temperature=input.temperature, + presence_penalty=input.repetition_penalty + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/doc-summarization/integrations/template.py b/comps/llms/src/doc-summarization/integrations/template.py new file mode 100644 index 0000000000..26596f55a0 --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/template.py @@ -0,0 +1,55 @@ +templ_en = """Write a concise summary of the following: + + +"{text}" + + +CONCISE SUMMARY:""" + +templ_zh = """请简要概括以下内容: + + +"{text}" + + +概况:""" + + +templ_refine_en = """Your job is to produce a final summary. +We have provided an existing summary up to a certain point, then we will provide more context. +You need to refine the existing summary (only if needed) with new context and generate a final summary. + + +Existing Summary: +"{existing_answer}" + + + +New Context: +"{text}" + + + +Final Summary: + +""" + +templ_refine_zh = """\ +你的任务是生成一个最终摘要。 +我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本 +你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。 + + +初始摘要: +"{existing_answer}" + + + +新的文本: +"{text}" + + + +最终摘要: + +""" \ No newline at end of file diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py b/comps/llms/src/doc-summarization/opea_docsum_microservice.py new file mode 100644 index 0000000000..8b45b5f674 --- /dev/null +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -0,0 +1,84 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time + +from integrations.opea import OPEADocSum_TGI, OPEADocSum_vLLM + +from comps import ( + CustomLogger, + DocSumLLMParams, + OpeaComponentController, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("llm_docsum") +logflag = os.getenv("LOGFLAG", False) + +llm_backend = os.getenv("LLM_BACKEND", "").lower() +if logflag: + logger.info(f"LLM BACKEND: {llm_backend}") + +comps_name = {"tgi": "OPEADocSum_TGI", "vllm": "OPEADocSum_vLLM"} +active_comps_name = comps_name[llm_backend] if llm_backend != "" else "" + +# Initialize OpeaComponentController +controller = OpeaComponentController() + +# Register components +try: + opea_docsum_tgi = OPEADocSum_TGI( + name=comps_name["tgi"], + description="OPEA DocSum Service", + ) + # Register components with the controller + controller.register(opea_docsum_tgi) + + opea_docsum_vllm = OPEADocSum_vLLM( + name=comps_name["vllm"], + description="OPEA DocSum Service", + ) + # Register components with the controller + controller.register(opea_docsum_vllm) + + # Discover and activate a healthy component + controller.discover_and_activate(active_comps_name) +except Exception as e: + logger.error(f"Failed to initialize components: {e}") + + +@register_microservice( + name="opea_service@llm_docsum", + service_type=ServiceType.LLM, + endpoint="/v1/docsum", + host="0.0.0.0", + port=9000, +) +@register_statistics(names=["opea_service@llm_docsum"]) +async def 
llm_generate(input: DocSumLLMParams): + start = time.time() + + # Log the input if logging is enabled + if logflag: + logger.info(input) + + try: + # Use the controller to invoke the active component + response = await controller.invoke(input) + # Record statistics + statistics_dict["opea_service@llm_docsum"].append_latency(time.time() - start, None) + return response + + except Exception as e: + logger.error(f"Error during DocSum invocation: {e}") + raise + + +if __name__ == "__main__": + logger.info("OPEA DocSum Microservice is starting...") + opea_microservices["opea_service@llm_docsum"].start() diff --git a/comps/llms/summarization/tgi/langchain/requirements-runtime.txt b/comps/llms/src/doc-summarization/requirements-runtime.txt similarity index 100% rename from comps/llms/summarization/tgi/langchain/requirements-runtime.txt rename to comps/llms/src/doc-summarization/requirements-runtime.txt diff --git a/comps/llms/summarization/tgi/langchain/requirements.txt b/comps/llms/src/doc-summarization/requirements.txt similarity index 100% rename from comps/llms/summarization/tgi/langchain/requirements.txt rename to comps/llms/src/doc-summarization/requirements.txt diff --git a/comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml b/comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml deleted file mode 100644 index 93579a5712..0000000000 --- a/comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -version: "3.8" - -services: - tgi_service: - image: ghcr.io/huggingface/text-generation-inference:2.1.0 - container_name: tgi-service - ports: - - "8008:80" - volumes: - - "./data:/data" - environment: - HF_TOKEN: ${HF_TOKEN} - shm_size: 1g - command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} - llm: - image: opea/llm-docsum-tgi:latest - container_name: llm-docsum-tgi-server - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} - MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} - MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} - LLM_MODEL_ID: ${LLM_MODEL_ID} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/llms/summarization/tgi/langchain/entrypoint.sh b/comps/llms/summarization/tgi/langchain/entrypoint.sh deleted file mode 100644 index d60eddd36b..0000000000 --- a/comps/llms/summarization/tgi/langchain/entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -pip --no-cache-dir install -r requirements-runtime.txt - -python llm.py diff --git a/comps/llms/summarization/tgi/langchain/llm.py b/comps/llms/summarization/tgi/langchain/llm.py deleted file mode 100644 index 465e5d26d3..0000000000 --- a/comps/llms/summarization/tgi/langchain/llm.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from fastapi.responses import StreamingResponse -from langchain.chains.summarize import load_summarize_chain -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain_community.llms import HuggingFaceEndpoint -from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter -from transformers import 
AutoTokenizer - -from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice -from comps.cores.mega.utils import get_access_token - -logger = CustomLogger("llm_docsum") -logflag = os.getenv("LOGFLAG", False) - -# Environment variables -TOKEN_URL = os.getenv("TOKEN_URL") -CLIENTID = os.getenv("CLIENTID") -CLIENT_SECRET = os.getenv("CLIENT_SECRET") -MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048)) -MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096)) -LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3") - -templ_en = """Write a concise summary of the following: - - -"{text}" - - -CONCISE SUMMARY:""" - -templ_zh = """请简要概括以下内容: - - -"{text}" - - -概况:""" - - -templ_refine_en = """Your job is to produce a final summary. -We have provided an existing summary up to a certain point, then we will provide more context. -You need to refine the existing summary (only if needed) with new context and generate a final summary. - - -Existing Summary: -"{existing_answer}" - - - -New Context: -"{text}" - - - -Final Summary: - -""" - -templ_refine_zh = """\ -你的任务是生成一个最终摘要。 -我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本 -你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。 - - -初始摘要: -"{existing_answer}" - - - -新的文本: -"{text}" - - - -最终摘要: - -""" - - -@register_microservice( - name="opea_service@llm_docsum", - service_type=ServiceType.LLM, - endpoint="/v1/chat/docsum", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: DocSumLLMParams): - if logflag: - logger.info(input) - - ### check summary type - summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"] - if input.summary_type not in summary_types: - raise NotImplementedError(f"Please specify the summary_type in {summary_types}") - if input.summary_type == "auto": ### Check input token length in auto mode - token_len = len(tokenizer.encode(input.query)) - if token_len > MAX_INPUT_TOKENS + 50: - input.summary_type = "refine" - if logflag: - logger.info( - f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode." - ) - else: - input.summary_type = "stuff" - if logflag: - logger.info( - f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode." 
- ) - - if input.language in ["en", "auto"]: - templ = templ_en - templ_refine = templ_refine_en - elif input.language in ["zh"]: - templ = templ_zh - templ_refine = templ_refine_zh - else: - raise NotImplementedError('Please specify the input language in "en", "zh", "auto"') - - ## Prompt - PROMPT = PromptTemplate.from_template(templ) - if input.summary_type == "refine": - PROMPT_REFINE = PromptTemplate.from_template(templ_refine) - if logflag: - logger.info("After prompting:") - logger.info(PROMPT) - if input.summary_type == "refine": - logger.info(PROMPT_REFINE) - - ## Split text - if input.summary_type == "stuff": - text_splitter = CharacterTextSplitter() - else: - if input.summary_type == "refine": - if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: - raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS - ) # 128 is reserved token length for prompt - else: - if MAX_TOTAL_TOKENS <= input.max_tokens + 50: - raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS - ) # 50 is reserved token length for prompt - chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens - chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) - text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( - tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - if logflag: - logger.info(f"set chunk size to: {chunk_size}") - logger.info(f"set chunk overlap to: {chunk_overlap}") - - texts = text_splitter.split_text(input.query) - docs = [Document(page_content=t) for t in texts] - if logflag: - logger.info(f"Split input query into {len(docs)} chunks") - logger.info(f"The character length of the first chunk is {len(texts[0])}") - - ## Access auth - access_token = ( - get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None - ) - server_kwargs = {} - if access_token: - server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"} - - ## LLM - if input.streaming and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support streaming=True, set to streaming=False") - input.streaming = False - llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") - llm = HuggingFaceEndpoint( - endpoint_url=llm_endpoint, - max_new_tokens=input.max_tokens, - top_k=input.top_k, - top_p=input.top_p, - typical_p=input.typical_p, - temperature=input.temperature, - repetition_penalty=input.repetition_penalty, - streaming=input.streaming, - server_kwargs=server_kwargs, - ) - - ## LLM chain - summary_type = input.summary_type - if summary_type == "stuff": - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "truncate": - docs = [docs[0]] - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "map_reduce": - llm_chain = load_summarize_chain( - llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True - ) - elif summary_type == "refine": - llm_chain = load_summarize_chain( - llm=llm, - question_prompt=PROMPT, - refine_prompt=PROMPT_REFINE, - chain_type="refine", - return_intermediate_steps=True, - ) - else: - raise NotImplementedError('Please specify the summary_type in "stuff", 
"truncate", "map_reduce", "refine"') - - if input.streaming: - - async def stream_generator(): - from langserve.serialization import WellKnownLCSerializer - - _serializer = WellKnownLCSerializer() - async for chunk in llm_chain.astream_log(docs): - data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") - if logflag: - logger.info(data) - yield f"data: {data}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = await llm_chain.ainvoke(docs) - - if input.summary_type in ["map_reduce", "refine"]: - intermediate_steps = response["intermediate_steps"] - if logflag: - logger.info("intermediate_steps:") - logger.info(intermediate_steps) - - output_text = response["output_text"] - if logflag: - logger.info("\n\noutput_text:") - logger.info(output_text) - - return GeneratedDoc(text=output_text, prompt=input.query) - - -if __name__ == "__main__": - tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID) - opea_microservices["opea_service@llm_docsum"].start() diff --git a/comps/llms/summarization/vllm/langchain/Dockerfile b/comps/llms/summarization/vllm/langchain/Dockerfile deleted file mode 100644 index 3a1cd5a8f7..0000000000 --- a/comps/llms/summarization/vllm/langchain/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -ARG ARCH="cpu" - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ - pip install --no-cache-dir -r /home/user/comps/llms/summarization/vllm/langchain/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/llms/summarization/vllm/langchain - -ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/summarization/vllm/langchain/README.md b/comps/llms/summarization/vllm/langchain/README.md deleted file mode 100644 index e0d591b69e..0000000000 --- a/comps/llms/summarization/vllm/langchain/README.md +++ /dev/null @@ -1,171 +0,0 @@ -# Document Summary vLLM Microservice - -This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using vLLM. -[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving, it delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention, Continuous batching and etc.. Besides GPUs, vLLM already supported [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). - -## 🚀1. Start Microservice with Python 🐍 (Option 1) - -To start the LLM microservice, you need to install python packages first. 
- -### 1.1 Install Requirements - -```bash -pip install -r requirements.txt -``` - -### 1.2 Start LLM Service - -```bash -export HF_TOKEN=${your_hf_api_token} -export LLM_MODEL_ID=${your_hf_llm_model} -docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} -``` - -### 1.3 Verify the vLLM Service - -```bash -curl http://${your_ip}:8008/v1/chat/completions \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning? "}]}' -``` - -### 1.4 Start LLM Service with Python Script - -```bash -export vLLM_ENDPOINT="http://${your_ip}:8008" -python llm.py -``` - -## 🚀2. Start Microservice with Docker 🐳 (Option 2) - -If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a vLLM/vLLM service with docker. - -To setup or build the vLLM image follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi) - -### 2.1 Setup Environment Variables - -In order to start vLLM and LLM services, you need to setup the following environment variables first. - -```bash -export HF_TOKEN=${your_hf_api_token} -export vLLM_ENDPOINT="http://${your_ip}:8008" -export LLM_MODEL_ID=${your_hf_llm_model} -``` - -### 2.2 Build Docker Image - -```bash -cd ../../../../../ -docker build -t opea/llm-docsum-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/vllm/langchain/Dockerfile . -``` - -To start a docker container, you have two options: - -- A. Run Docker with CLI -- B. Run Docker with Docker Compose - -You can choose one as needed. - -### 2.3 Run Docker with CLI (Option A) - -```bash -docker run -d --name="llm-docsum-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-docsum-vllm:latest -``` - -### 2.4 Run Docker with Docker Compose (Option B) - -```bash -docker compose -f docker_compose_llm.yaml up -d -``` - -## 🚀3. Consume LLM Service - -### 3.1 Check Service Status - -```bash -curl http://${your_ip}:9000/v1/health_check\ - -X GET \ - -H 'Content-Type: application/json' -``` - -### 3.2 Consume LLM Service - -In DocSum microservice, except for basic LLM parameters, we also support several optimization parameters setting. - -- "language": specify the language, can be "auto", "en", "zh", default is "auto" - -If you want to deal with long context, can select suitable summary type, details in section 3.2.2. - -- "summary_type": can be "auto", "stuff", "truncate", "map_reduce", "refine", default is "auto" -- "chunk_size": max token length for each chunk. Set to be different default value according to "summary_type". -- "chunk_overlap": overlap token length between each chunk, default is 0.1\*chunk_size - -#### 3.2.1 Basic usage - -```bash -# Enable streaming to receive a streaming response. By default, this is set to True. -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \ - -H 'Content-Type: application/json' - -# Disable streaming to receive a non-streaming response. -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \ - -H 'Content-Type: application/json' - -# Use Chinese mode -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \ - -H 'Content-Type: application/json' -``` - -#### 3.2.2 Long context summarization with "summary_type" - -**summary_type=auto** - -"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode. - -**summary_type=stuff** - -In this mode LLM generate summary based on complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed LLM context limit and raise error when meet long context. - -**summary_type=truncate** - -Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` - -```bash -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \ - -H 'Content-Type: application/json' -``` - -**summary_type=map_reduce** - -Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here. - -In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` - -```bash -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \ - -H 'Content-Type: application/json' -``` - -**summary_type=refine** - -Refin mode will split the inputs into multiple chunks, generate summary for the first one, then combine with the second, loops over every remaining chunks to get the final summary. - -In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`. 
- -```bash -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \ - -H 'Content-Type: application/json' -``` diff --git a/comps/llms/summarization/vllm/langchain/__init__.py b/comps/llms/summarization/vllm/langchain/__init__.py deleted file mode 100644 index 916f3a44b2..0000000000 --- a/comps/llms/summarization/vllm/langchain/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/summarization/vllm/langchain/llm.py b/comps/llms/summarization/vllm/langchain/llm.py deleted file mode 100644 index f134a75a55..0000000000 --- a/comps/llms/summarization/vllm/langchain/llm.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -from pathlib import Path as p - -from fastapi.responses import StreamingResponse -from langchain.chains.summarize import load_summarize_chain -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain_community.llms import VLLMOpenAI -from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter -from transformers import AutoTokenizer - -from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice -from comps.cores.mega.utils import get_access_token - -logger = CustomLogger("llm_docsum") -logflag = os.getenv("LOGFLAG", False) - -# Environment variables -TOKEN_URL = os.getenv("TOKEN_URL") -CLIENTID = os.getenv("CLIENTID") -CLIENT_SECRET = os.getenv("CLIENT_SECRET") -MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS")) -MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS")) -LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", None) - -templ_en = """Write a concise summary of the following: - - -"{text}" - - -CONCISE SUMMARY:""" - -templ_zh = """请简要概括以下内容: - - -"{text}" - - -概况:""" - - -templ_refine_en = """Your job is to produce a final summary. -We have provided an existing summary up to a certain point, then we will provide more context. -You need to refine the existing summary (only if needed) with new context and generate a final summary. 
- - -Existing Summary: -"{existing_answer}" - - - -New Context: -"{text}" - - - -Final Summary: - -""" - -templ_refine_zh = """\ -你的任务是生成一个最终摘要。 -我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本 -你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。 - - -初始摘要: -"{existing_answer}" - - - -新的文本: -"{text}" - - - -最终摘要: - -""" - - -@register_microservice( - name="opea_service@llm_docsum", - service_type=ServiceType.LLM, - endpoint="/v1/chat/docsum", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: DocSumLLMParams): - if logflag: - logger.info(input) - - ### check summary type - summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"] - if input.summary_type not in summary_types: - raise NotImplementedError(f"Please specify the summary_type in {summary_types}") - if input.summary_type == "auto": ### Check input token length in auto mode - token_len = len(tokenizer.encode(input.query)) - if token_len > MAX_INPUT_TOKENS + 50: - input.summary_type = "refine" - if logflag: - logger.info( - f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode." - ) - else: - input.summary_type = "stuff" - if logflag: - logger.info( - f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode." - ) - - if input.language in ["en", "auto"]: - templ = templ_en - templ_refine = templ_refine_en - elif input.language in ["zh"]: - templ = templ_zh - templ_refine = templ_refine_zh - else: - raise NotImplementedError('Please specify the input language in "en", "zh", "auto"') - - ## Prompt - PROMPT = PromptTemplate.from_template(templ) - if input.summary_type == "refine": - PROMPT_REFINE = PromptTemplate.from_template(templ_refine) - if logflag: - logger.info("After prompting:") - logger.info(PROMPT) - if input.summary_type == "refine": - logger.info(PROMPT_REFINE) - - ## Split text - if input.summary_type == "stuff": - text_splitter = CharacterTextSplitter() - else: - if input.summary_type == "refine": - if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: - raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS - ) # 128 is reserved token length for prompt - else: - if MAX_TOTAL_TOKENS <= input.max_tokens + 50: - raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS - ) # 50 is reserved token length for prompt - chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens - chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) - text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( - tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - if logflag: - logger.info(f"set chunk size to: {chunk_size}") - logger.info(f"set chunk overlap to: {chunk_overlap}") - - texts = text_splitter.split_text(input.query) - docs = [Document(page_content=t) for t in texts] - if logflag: - logger.info(f"Split input query into {len(docs)} chunks") - logger.info(f"The character length of the first chunk is {len(texts[0])}") - - ## Access auth - access_token = ( - get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None - ) - headers = {} - if access_token: - headers = {"Authorization": f"Bearer {access_token}"} - - ## LLM - if 
input.streaming and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support streaming=True, set to streaming=False") - input.streaming = False - llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080") - model = input.model if input.model else os.getenv("LLM_MODEL_ID") - llm = VLLMOpenAI( - openai_api_key="EMPTY", - openai_api_base=llm_endpoint + "/v1", - model_name=model, - default_headers=headers, - max_tokens=input.max_tokens, - top_p=input.top_p, - streaming=input.streaming, - temperature=input.temperature, - presence_penalty=input.repetition_penalty, - ) - - ## LLM chain - summary_type = input.summary_type - if summary_type == "stuff": - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "truncate": - docs = [docs[0]] - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "map_reduce": - llm_chain = load_summarize_chain( - llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True - ) - elif summary_type == "refine": - llm_chain = load_summarize_chain( - llm=llm, - question_prompt=PROMPT, - refine_prompt=PROMPT_REFINE, - chain_type="refine", - return_intermediate_steps=True, - ) - else: - raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"') - - if input.streaming: - - async def stream_generator(): - from langserve.serialization import WellKnownLCSerializer - - _serializer = WellKnownLCSerializer() - async for chunk in llm_chain.astream_log(docs): - data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") - if logflag: - logger.info(data) - yield f"data: {data}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = await llm_chain.ainvoke(docs) - - if input.summary_type in ["map_reduce", "refine"]: - intermediate_steps = response["intermediate_steps"] - if logflag: - logger.info("intermediate_steps:") - logger.info(intermediate_steps) - - output_text = response["output_text"] - if logflag: - logger.info("\n\noutput_text:") - logger.info(output_text) - - return GeneratedDoc(text=output_text, prompt=input.query) - - -if __name__ == "__main__": - tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID) - opea_microservices["opea_service@llm_docsum"].start() diff --git a/comps/llms/summarization/vllm/langchain/requirements-runtime.txt b/comps/llms/summarization/vllm/langchain/requirements-runtime.txt deleted file mode 100644 index 225adde271..0000000000 --- a/comps/llms/summarization/vllm/langchain/requirements-runtime.txt +++ /dev/null @@ -1 +0,0 @@ -langserve diff --git a/comps/llms/summarization/vllm/langchain/requirements.txt b/comps/llms/summarization/vllm/langchain/requirements.txt deleted file mode 100644 index 1694618637..0000000000 --- a/comps/llms/summarization/vllm/langchain/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -docarray[full] -fastapi -httpx==0.27.2 -huggingface_hub -langchain #==0.1.12 -langchain-huggingface -langchain-openai -langchain_community -langchainhub -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -transformers -uvicorn From f15fdf9e0da52173f977a13f7ce29ef585cb4e00 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Tue, 31 Dec 2024 15:35:27 +0800 Subject: [PATCH 02/23] vllm input Signed-off-by: Xinyao Wang --- .../docker_compose/doc-summarization_vllm_on_intel_hpu.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
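Note on this change: the service caps every chunk it sends to the backend using the token budgeting shown in the summarization code above, so passing MAX_INPUT_TOKENS through to vLLM's `--max-seq-len-to-capture` keeps the engine's captured sequence length in step with the largest chunk the service can produce. A minimal, self-contained sketch of that budgeting; the helper name `max_input_budget` is illustrative and not part of the codebase, while the reserved-token constants mirror the logic above:

```python
# Illustrative sketch (not part of the patch): how the docsum service budgets tokens.
# MAX_INPUT_TOKENS / MAX_TOTAL_TOKENS come from the environment, as in the compose files.

def max_input_budget(summary_type: str, max_new_tokens: int,
                     max_input_tokens: int, max_total_tokens: int) -> int:
    """Return the largest chunk size (in tokens) the server should accept."""
    if summary_type == "refine":
        # refine feeds the running summary back in, so budget two generations plus ~128 prompt tokens
        if max_total_tokens <= 2 * max_new_tokens + 128:
            raise RuntimeError("Set MAX_TOTAL_TOKENS larger than max_tokens * 2 + 128 for refine mode")
        return min(max_total_tokens - 2 * max_new_tokens - 128, max_input_tokens)
    # stuff / truncate / map_reduce reserve ~50 tokens for the prompt template
    if max_total_tokens <= max_new_tokens + 50:
        raise RuntimeError("Set MAX_TOTAL_TOKENS larger than max_tokens + 50")
    return min(max_total_tokens - max_new_tokens - 50, max_input_tokens)

# Example with the values used in the tests: MAX_INPUT_TOKENS=2048, MAX_TOTAL_TOKENS=4096, max_tokens=32
print(max_input_budget("refine", 32, 2048, 4096))  # -> 2048
```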
diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml index 4f4d836b84..7252210b23 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml @@ -19,6 +19,7 @@ services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} VLLM_TORCH_PROFILER_DIR: "/mnt" host_ip: ${host_ip} LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} @@ -32,7 +33,7 @@ services: interval: 10s timeout: 10s retries: 100 - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_INPUT_TOKENS} llm: image: opea/llm-docsum:latest container_name: llm-docsum-server From 2251a0fd26fc178248e461d7bb8926b93e13617f Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 2 Jan 2025 15:28:38 +0800 Subject: [PATCH 03/23] refine ut for docsum Signed-off-by: Xinyao Wang --- .../docker_compose/doc-summarization_tgi.yaml | 1 + .../doc-summarization_tgi_on_intel_hpu.yaml | 1 + .../doc-summarization_vllm.yaml | 1 + .../doc-summarization_vllm_on_intel_hpu.yaml | 1 + comps/llms/src/doc-summarization/README.md | 2 +- tests/llms/test_llms_doc-summarization_tgi.sh | 153 ++++++++++++++++ ...llms_doc-summarization_tgi_on_intel_hpu.sh | 153 ++++++++++++++++ .../llms/test_llms_doc-summarization_vllm.sh | 165 +++++++++++++++++ ...lms_doc-summarization_vllm_on_intel_hpu.sh | 166 ++++++++++++++++++ .../test_llms_summarization_tgi_langchain.sh | 133 -------------- 10 files changed, 642 insertions(+), 134 deletions(-) create mode 100644 tests/llms/test_llms_doc-summarization_tgi.sh create mode 100644 tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh create mode 100644 tests/llms/test_llms_doc-summarization_vllm.sh create mode 100644 tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh delete mode 100644 tests/llms/test_llms_summarization_tgi_langchain.sh diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml index af1caf6fb0..e1238908d5 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml @@ -44,6 +44,7 @@ services: LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} LLM_BACKEND: ${LLM_BACKEND} + LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped networks: diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml index 7afe3fda82..47de4b6739 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml @@ -55,6 +55,7 @@ services: LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} LLM_BACKEND: ${LLM_BACKEND} + LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped networks: diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml index 9671fff53b..ffae9597f8 100644 --- 
a/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml @@ -47,6 +47,7 @@ services: LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} LLM_BACKEND: ${LLM_BACKEND} + LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped networks: diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml index 7252210b23..e0040eaece 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml @@ -53,6 +53,7 @@ services: LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} LLM_BACKEND: ${LLM_BACKEND} + LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped networks: diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md index 4dd8b664e6..e7389460b0 100644 --- a/comps/llms/src/doc-summarization/README.md +++ b/comps/llms/src/doc-summarization/README.md @@ -11,7 +11,7 @@ In order to start DocSum services, you need to setup the following environment v ```bash export host_ip=${your_host_ip} export LLM_ENDPOINT_PORT=8008 -export FAQ_PORT=9000 +export DOCSUM_PORT=9000 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export LLM_MODEL_ID=${your_hf_llm_model} diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh new file mode 100644 index 0000000000..fd677c1871 --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5072 + export DOCSUM_PORT=5073 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export LLM_BACKEND="tgi" # or "vllm" + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." 
+ else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + DOCSUM_PORT=5076 + URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + + echo "Validate tgi..." + validate_services \ + "${LLM_ENDPOINT}/generate" \ + "generated_text" \ + "tgi" \ + "tgi-server" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + + echo "Validate stream=True..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh new file mode 100644 index 0000000000..83519bda7a --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5071 + export DOCSUM_PORT=5072 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export LLM_BACKEND="tgi" # or "vllm" + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + DOCSUM_PORT=5076 + URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + + echo "Validate tgi..." + validate_services \ + "${LLM_ENDPOINT}/generate" \ + "generated_text" \ + "tgi" \ + "tgi-gaudi-server" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + + echo "Validate stream=True..." 
+ validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi_on_intel_hpu.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh new file mode 100644 index 0000000000..be9286051f --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + git clone https://github.com/vllm-project/vllm.git + cd ./vllm/ + docker build -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy . + if [ $? -ne 0 ]; then + echo "opea/vllm built fail" + exit 1 + else + echo "opea/vllm built successful" + fi + + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5074 + export DOCSUM_PORT=5075 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export LLM_BACKEND="vllm" # or "vllm" + export VLLM_SKIP_WARMUP=true + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + DOCSUM_PORT=5076 + URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + + echo "Validate vllm..." 
+ validate_services \ + "${LLM_ENDPOINT}/v1/completions" \ + "text" \ + "vllm" \ + "vllm-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' + + echo "Validate stream=True..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_vllm.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh new file mode 100644 index 0000000000..acbde4b1c1 --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + git clone https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork/ + git checkout 3c39626 + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . + if [ $? -ne 0 ]; then + echo "opea/vllm-gaudi built fail" + exit 1 + else + echo "opea/vllm-gaudi built successful" + fi + + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5076 + export DOCSUM_PORT=5077 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export LLM_BACKEND="vllm" # or "vllm" + export VLLM_SKIP_WARMUP=true + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + DOCSUM_PORT=5076 + URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + + echo "Validate vllm..." 
+ validate_services \ + "${LLM_ENDPOINT}/v1/completions" \ + "text" \ + "vllm" \ + "vllm-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' + + echo "Validate stream=True..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_vllm_on_intel_hpu.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_summarization_tgi_langchain.sh b/tests/llms/test_llms_summarization_tgi_langchain.sh deleted file mode 100644 index d805b7361b..0000000000 --- a/tests/llms/test_llms_summarization_tgi_langchain.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe - -WORKPATH=$(dirname "$PWD") -ip_address=$(hostname -I | awk '{print $1}') -LOG_PATH="$WORKPATH/tests" - -function build_docker_images() { - cd $WORKPATH - docker build --no-cache -t opea/llm-sum-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/llm-tgi built fail" - exit 1 - else - echo "opea/llm-tgi built successful" - fi -} - -function start_service() { - tgi_endpoint_port=5075 - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" - export MAX_INPUT_TOKENS=2048 - export MAX_TOTAL_TOKENS=4096 - # Remember to set HF_TOKEN before invoking this test! - export HF_TOKEN=${HF_TOKEN} - docker run -d --name="test-comps-llm-sum-tgi-endpoint" -p $tgi_endpoint_port:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} - export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint_port}" - - sum_port=5076 - docker run -d --name="test-comps-llm-sum-tgi-server" -p ${sum_port}:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e LLM_MODEL_ID=$LLM_MODEL_ID -e MAX_INPUT_TOKENS=$MAX_INPUT_TOKENS -e MAX_TOTAL_TOKENS=$MAX_TOTAL_TOKENS -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN -e LOGFLAG=True opea/llm-sum-tgi:comps - - # check whether tgi is fully ready - n=0 - until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do - docker logs test-comps-llm-sum-tgi-endpoint > ${LOG_PATH}/test-comps-llm-sum-tgi-endpoint.log - n=$((n+1)) - if grep -q Connected ${LOG_PATH}/test-comps-llm-sum-tgi-endpoint.log; then - break - fi - sleep 5s - done - sleep 5s -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - echo $CONTENT - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." 
- else - echo "[ $SERVICE_NAME ] Content does not match the expected result" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_microservices() { - sum_port=5076 - URL="http://${ip_address}:$sum_port/v1/chat/docsum" - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?"}' - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?", "summary_type": "truncate"}' - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?", "summary_type": "map_reduce"}' - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?", "summary_type": "refine"}' -} - -function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-llm-sum-tgi*") - if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi -} - -function main() { - - stop_docker - - build_docker_images - start_service - - validate_microservices - - stop_docker - echo y | docker system prune - -} - -main From e3a88910f7696de0d56bdde38b7ae280186c178e Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 2 Jan 2025 15:40:54 +0800 Subject: [PATCH 04/23] fix docker path for docsum Signed-off-by: Xinyao Wang --- .github/workflows/docker/compose/llms-compose.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml index 2d42e6f46d..3400e3179f 100644 --- a/.github/workflows/docker/compose/llms-compose.yaml +++ b/.github/workflows/docker/compose/llms-compose.yaml @@ -11,9 +11,9 @@ services: build: dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest} - llm-docsum-tgi: + llm-docsum: build: - dockerfile: comps/llms/summarization/tgi/langchain/Dockerfile + dockerfile: comps/llms/src/doc-summarization/Dockerfile image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest} llm-faqgen-tgi: build: @@ -54,10 +54,6 @@ services: build: dockerfile: comps/llms/text-generation/predictionguard/Dockerfile image: ${REGISTRY:-opea}/llm-textgen-predictionguard:${TAG:-latest} - llm-docsum-vllm: - build: - dockerfile: comps/llms/summarization/vllm/langchain/Dockerfile - image: ${REGISTRY:-opea}/llm-docsum-vllm:${TAG:-latest} llm-faqgen-vllm: build: dockerfile: comps/llms/faq-generation/vllm/langchain/Dockerfile From 79e3404083bb12fcaf007a2d9f1c21f80f9660c2 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 2 Jan 2025 15:41:37 +0800 Subject: [PATCH 05/23] for ut, duplicate with faqgen pr, can be removed later Signed-off-by: Xinyao Wang --- comps/cores/common/component.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/comps/cores/common/component.py b/comps/cores/common/component.py index 2bbd436f21..f16abb9926 100644 --- a/comps/cores/common/component.py +++ b/comps/cores/common/component.py @@ -114,17 +114,26 @@ def register(self, component): logger.info(f"Registered component: {component.name}") self.components[component.name] = component - def 
discover_and_activate(self): + def discover_and_activate(self, active_comp_name=""): """Discovers healthy components and activates one. - If multiple components are healthy, it prioritizes the first registered component. + Attributes: + active_comp_name: Specify the component name to be tested first, if not set, it prioritizes the first registered component if multiple components are healthy. """ - for component in self.components.values(): + if active_comp_name != "" and active_comp_name in self.components.keys(): + component = self.components[active_comp_name] if component.check_health(): self.active_component = component - logger.info(f"Activated component: {component.name}") - return - raise RuntimeError("No healthy components available.") + else: + for component in self.components.values(): + if component.check_health(): + self.active_component = component + + if self.active_component: + print(f"Activated component: {self.active_component.name}") + return + else: + raise RuntimeError("No healthy components available.") async def invoke(self, *args, **kwargs): """Invokes service accessing using the active component. From aa6fbc028e7db4e424f76cde3fca23140f77151c Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 2 Jan 2025 16:28:51 +0800 Subject: [PATCH 06/23] fix bug Signed-off-by: Xinyao Wang --- comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml index e1238908d5..9613c003f8 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml @@ -4,7 +4,7 @@ version: "3.8" services: - tgi-service: + tgi-service: image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-server ports: From aabf05916c7ede1cc6fd09c264de74098fcf784f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 08:30:02 +0000 Subject: [PATCH 07/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../doc-summarization/integrations/opea.py | 30 +++++++++++-------- .../integrations/template.py | 5 +++- tests/llms/test_llms_doc-summarization_tgi.sh | 2 +- ...llms_doc-summarization_tgi_on_intel_hpu.sh | 2 +- .../llms/test_llms_doc-summarization_vllm.sh | 2 +- ...lms_doc-summarization_vllm_on_intel_hpu.sh | 2 +- 6 files changed, 25 insertions(+), 18 deletions(-) diff --git a/comps/llms/src/doc-summarization/integrations/opea.py b/comps/llms/src/doc-summarization/integrations/opea.py index 08beb619d1..ec66658468 100644 --- a/comps/llms/src/doc-summarization/integrations/opea.py +++ b/comps/llms/src/doc-summarization/integrations/opea.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os + import requests from fastapi.responses import StreamingResponse from langchain.chains.summarize import load_summarize_chain @@ -10,11 +11,12 @@ from langchain_community.llms import HuggingFaceEndpoint, VLLMOpenAI from langchain_core.prompts import PromptTemplate from transformers import AutoTokenizer -from .template import templ_en, templ_zh, templ_refine_en, templ_refine_zh -from comps import CustomLogger, GeneratedDoc, DocSumLLMParams, OpeaComponent, ServiceType +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType from comps.cores.mega.utils 
import ConfigError, get_access_token, load_model_configs +from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh + logger = CustomLogger("llm_docsum") logflag = os.getenv("LOGFLAG", False) @@ -55,6 +57,7 @@ def get_llm_endpoint(): logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}") raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs") + class OPEADocSum(OpeaComponent): """A specialized OPEA DocSum component derived from OpeaComponent. @@ -121,17 +124,13 @@ async def generate(self, input: DocSumLLMParams, client): text_splitter = CharacterTextSplitter() else: if input.summary_type == "refine": - if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: ## 128 is reserved prompt lenght + if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: ## 128 is reserved prompt length raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS - ) + max_input_tokens = min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS) else: - if MAX_TOTAL_TOKENS <= input.max_tokens + 50: # 50 is reserved token length for prompt + if MAX_TOTAL_TOKENS <= input.max_tokens + 50: # 50 is reserved token length for prompt raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS - ) + max_input_tokens = min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS) chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( @@ -156,7 +155,11 @@ async def generate(self, input: DocSumLLMParams, client): llm_chain = load_summarize_chain(llm=client, prompt=PROMPT) elif summary_type == "map_reduce": llm_chain = load_summarize_chain( - llm=client, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True + llm=client, + map_prompt=PROMPT, + combine_prompt=PROMPT, + chain_type="map_reduce", + return_intermediate_steps=True, ) elif summary_type == "refine": llm_chain = load_summarize_chain( @@ -167,7 +170,7 @@ async def generate(self, input: DocSumLLMParams, client): return_intermediate_steps=True, ) else: - raise NotImplementedError(f'Please specify the summary_type in {summary_types}') + raise NotImplementedError(f"Please specify the summary_type in {summary_types}") if input.streaming: @@ -199,6 +202,7 @@ async def stream_generator(): return GeneratedDoc(text=output_text, prompt=input.query) + class OPEADocSum_TGI(OPEADocSum): """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. 
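Note on this hunk: registering the component under a string key is what later lets the microservice select a backend by name instead of hard-wiring a controller. A rough sketch of the idea using a toy registry, not the real `OpeaComponentRegistry`/`OpeaComponentLoader` classes, whose internals are not shown in this patch:

```python
# Toy illustration of the registry pattern -- not the actual comps API.
# A class decorator records each component under a string key, and a loader
# instantiates whichever name an environment variable selects.

_REGISTRY: dict[str, type] = {}

def register(name: str):
    def wrap(cls):
        _REGISTRY[name] = cls          # e.g. "OPEADocSum_TGI" -> component class
        return cls
    return wrap

@register("OPEADocSum_TGI")
class DemoDocSumTGI:
    def invoke(self, query: str) -> str:
        return f"summary({query})"

def load(name: str):
    return _REGISTRY[name]()           # raises KeyError for unknown component names

component = load("OPEADocSum_TGI")
print(component.invoke("hello"))       # -> summary(hello)
```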
@@ -307,7 +311,7 @@ async def invoke(self, input: DocSumLLMParams): top_p=input.top_p, streaming=input.streaming, temperature=input.temperature, - presence_penalty=input.repetition_penalty + presence_penalty=input.repetition_penalty, ) result = await self.generate(input, self.client) diff --git a/comps/llms/src/doc-summarization/integrations/template.py b/comps/llms/src/doc-summarization/integrations/template.py index 26596f55a0..20ef59454c 100644 --- a/comps/llms/src/doc-summarization/integrations/template.py +++ b/comps/llms/src/doc-summarization/integrations/template.py @@ -1,3 +1,6 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + templ_en = """Write a concise summary of the following: @@ -52,4 +55,4 @@ 最终摘要: -""" \ No newline at end of file +""" diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh index fd677c1871..c6615b59e8 100644 --- a/tests/llms/test_llms_doc-summarization_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -105,7 +105,7 @@ function validate_microservices() { "llm_summarization" \ "llm-docsum-server" \ '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' - + echo "Validate truncate mode..." validate_services \ "$URL" \ diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh index 83519bda7a..b68afce2f2 100644 --- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -105,7 +105,7 @@ function validate_microservices() { "llm_summarization" \ "llm-docsum-server" \ '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' - + echo "Validate truncate mode..." validate_services \ "$URL" \ diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh index be9286051f..ba98f4be23 100644 --- a/tests/llms/test_llms_doc-summarization_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -117,7 +117,7 @@ function validate_microservices() { "llm_summarization" \ "llm-docsum-server" \ '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' - + echo "Validate truncate mode..." validate_services \ "$URL" \ diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh index acbde4b1c1..7b68b07d21 100644 --- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -118,7 +118,7 @@ function validate_microservices() { "llm_summarization" \ "llm-docsum-server" \ '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' - + echo "Validate truncate mode..." 
validate_services \ "$URL" \ From 75f2620586ca902aafaab8c1c306f61688955e74 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 9 Jan 2025 19:58:23 +0800 Subject: [PATCH 08/23] align to registry Signed-off-by: Xinyao Wang --- .../docker_compose/doc-summarization_tgi.yaml | 2 +- .../doc-summarization_tgi_on_intel_hpu.yaml | 2 +- .../doc-summarization_vllm.yaml | 2 +- .../doc-summarization_vllm_on_intel_hpu.yaml | 2 +- comps/llms/src/doc-summarization/README.md | 4 +- .../doc-summarization/integrations/opea.py | 9 +++-- .../opea_docsum_microservice.py | 40 +++---------------- tests/llms/test_llms_doc-summarization_tgi.sh | 2 +- ...llms_doc-summarization_tgi_on_intel_hpu.sh | 2 +- .../llms/test_llms_doc-summarization_vllm.sh | 2 +- ...lms_doc-summarization_vllm_on_intel_hpu.sh | 2 +- 11 files changed, 21 insertions(+), 48 deletions(-) diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml index 9613c003f8..9a14e5e5c9 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml @@ -43,7 +43,7 @@ services: MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} - LLM_BACKEND: ${LLM_BACKEND} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml index 47de4b6739..1424884439 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml @@ -54,7 +54,7 @@ services: MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} - LLM_BACKEND: ${LLM_BACKEND} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml index ffae9597f8..d14da4d527 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml @@ -46,7 +46,7 @@ services: MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} - LLM_BACKEND: ${LLM_BACKEND} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml index e0040eaece..1a00b0d052 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml @@ -52,7 +52,7 @@ services: MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} - LLM_BACKEND: ${LLM_BACKEND} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md index d47ced77ff..a18e1a41d8 100644 --- a/comps/llms/src/doc-summarization/README.md +++ b/comps/llms/src/doc-summarization/README.md @@ -17,7 +17,7 @@ export 
LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export LLM_MODEL_ID=${your_hf_llm_model} export MAX_INPUT_TOKENS=2048 export MAX_TOTAL_TOKENS=4096 -export LLM_BACKEND="tgi" # or "vllm" +export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "OPEADocSum_vLLM" ``` Please make sure MAX_TOTAL_TOKENS should be larger than (MAX_INPUT_TOKENS + max_new_tokens + 50), 50 is reserved prompt length. @@ -63,7 +63,7 @@ docker run -d \ -e LLM_MODEL_ID=$LLM_MODEL_ID \ -e LLM_ENDPOINT=$LLM_ENDPOINT \ -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \ - -e LLM_BACKEND=$LLM_BACKEND \ + -e DocSum_COMPONENT_NAME=$DocSum_COMPONENT_NAME \ -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} \ -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} \ opea/llm-docsum:latest diff --git a/comps/llms/src/doc-summarization/integrations/opea.py b/comps/llms/src/doc-summarization/integrations/opea.py index ec66658468..7e9b04c9a6 100644 --- a/comps/llms/src/doc-summarization/integrations/opea.py +++ b/comps/llms/src/doc-summarization/integrations/opea.py @@ -12,7 +12,7 @@ from langchain_core.prompts import PromptTemplate from transformers import AutoTokenizer -from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh @@ -72,6 +72,9 @@ def __init__(self, name: str, description: str, config: dict = None): ) self.llm_endpoint = get_llm_endpoint() self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + health_status = self.check_health() + if not health_status: + logger.error("OPEADocSum health check failed.") async def generate(self, input: DocSumLLMParams, client): """Invokes the TGI/vLLM LLM service to generate summarization for the provided input. @@ -202,7 +205,7 @@ async def stream_generator(): return GeneratedDoc(text=output_text, prompt=input.query) - +@OpeaComponentRegistry.register("OPEADocSum_TGI") class OPEADocSum_TGI(OPEADocSum): """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. @@ -263,7 +266,7 @@ async def invoke(self, input: DocSumLLMParams): return result - +@OpeaComponentRegistry.register("OPEADocSum_vLLM") class OPEADocSum_vLLM(OPEADocSum): """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. 
diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py b/comps/llms/src/doc-summarization/opea_docsum_microservice.py index 8b45b5f674..22bca39c2b 100644 --- a/comps/llms/src/doc-summarization/opea_docsum_microservice.py +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -4,12 +4,10 @@ import os import time -from integrations.opea import OPEADocSum_TGI, OPEADocSum_vLLM - from comps import ( CustomLogger, DocSumLLMParams, - OpeaComponentController, + OpeaComponentLoader, ServiceType, opea_microservices, register_microservice, @@ -20,37 +18,9 @@ logger = CustomLogger("llm_docsum") logflag = os.getenv("LOGFLAG", False) -llm_backend = os.getenv("LLM_BACKEND", "").lower() -if logflag: - logger.info(f"LLM BACKEND: {llm_backend}") - -comps_name = {"tgi": "OPEADocSum_TGI", "vllm": "OPEADocSum_vLLM"} -active_comps_name = comps_name[llm_backend] if llm_backend != "" else "" - -# Initialize OpeaComponentController -controller = OpeaComponentController() - -# Register components -try: - opea_docsum_tgi = OPEADocSum_TGI( - name=comps_name["tgi"], - description="OPEA DocSum Service", - ) - # Register components with the controller - controller.register(opea_docsum_tgi) - - opea_docsum_vllm = OPEADocSum_vLLM( - name=comps_name["vllm"], - description="OPEA DocSum Service", - ) - # Register components with the controller - controller.register(opea_docsum_vllm) - - # Discover and activate a healthy component - controller.discover_and_activate(active_comps_name) -except Exception as e: - logger.error(f"Failed to initialize components: {e}") - +llm_component_name = os.getenv("DocSum_COMPONENT_NAME", "OPEADocSum_TGI") +# Initialize OpeaComponentLoader +loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM DocSum Component: {llm_component_name}") @register_microservice( name="opea_service@llm_docsum", @@ -69,7 +39,7 @@ async def llm_generate(input: DocSumLLMParams): try: # Use the controller to invoke the active component - response = await controller.invoke(input) + response = await loader.invoke(input) # Record statistics statistics_dict["opea_service@llm_docsum"].append_latency(time.time() - start, None) return response diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh index c6615b59e8..fd6ca015c7 100644 --- a/tests/llms/test_llms_doc-summarization_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -28,7 +28,7 @@ function start_service() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export MAX_INPUT_TOKENS=2048 export MAX_TOTAL_TOKENS=4096 - export LLM_BACKEND="tgi" # or "vllm" + export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "vllm" export LOGFLAG=True cd $WORKPATH/comps/llms/deployment/docker_compose diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh index b68afce2f2..1256876b21 100644 --- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -28,7 +28,7 @@ function start_service() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export MAX_INPUT_TOKENS=2048 export MAX_TOTAL_TOKENS=4096 - export LLM_BACKEND="tgi" # or "vllm" + export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "vllm" export LOGFLAG=True cd $WORKPATH/comps/llms/deployment/docker_compose diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh index ba98f4be23..d87c984c25 100644 --- 
a/tests/llms/test_llms_doc-summarization_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -39,7 +39,7 @@ function start_service() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export MAX_INPUT_TOKENS=2048 export MAX_TOTAL_TOKENS=4096 - export LLM_BACKEND="vllm" # or "vllm" + export DocSum_COMPONENT_NAME="OPEADocSum_vLLM" # or "vllm" export VLLM_SKIP_WARMUP=true export LOGFLAG=True diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh index 7b68b07d21..df797adc3f 100644 --- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -40,7 +40,7 @@ function start_service() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export MAX_INPUT_TOKENS=2048 export MAX_TOTAL_TOKENS=4096 - export LLM_BACKEND="vllm" # or "vllm" + export DocSum_COMPONENT_NAME="OPEADocSum_vLLM" # or "vllm" export VLLM_SKIP_WARMUP=true export LOGFLAG=True From fbdc51b24a58e0d9cb0baed756478355ef0b9a1b Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 9 Jan 2025 20:00:33 +0800 Subject: [PATCH 09/23] fix streaming Signed-off-by: Xinyao Wang --- .../src/doc-summarization/integrations/opea.py | 18 +++++++++--------- tests/llms/test_llms_doc-summarization_tgi.sh | 6 +++--- ..._llms_doc-summarization_tgi_on_intel_hpu.sh | 6 +++--- tests/llms/test_llms_doc-summarization_vllm.sh | 6 +++--- ...llms_doc-summarization_vllm_on_intel_hpu.sh | 6 +++--- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/comps/llms/src/doc-summarization/integrations/opea.py b/comps/llms/src/doc-summarization/integrations/opea.py index 7e9b04c9a6..5276251eba 100644 --- a/comps/llms/src/doc-summarization/integrations/opea.py +++ b/comps/llms/src/doc-summarization/integrations/opea.py @@ -175,7 +175,7 @@ async def generate(self, input: DocSumLLMParams, client): else: raise NotImplementedError(f"Please specify the summary_type in {summary_types}") - if input.streaming: + if input.stream: async def stream_generator(): from langserve.serialization import WellKnownLCSerializer @@ -248,9 +248,9 @@ async def invoke(self, input: DocSumLLMParams): if self.access_token: server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"} - if input.streaming and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support streaming=True, set to streaming=False") - input.streaming = False + if input.stream and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support stream=True, set to stream=False") + input.stream = False self.client = HuggingFaceEndpoint( endpoint_url=self.llm_endpoint, max_new_tokens=input.max_tokens, @@ -259,7 +259,7 @@ async def invoke(self, input: DocSumLLMParams): typical_p=input.typical_p, temperature=input.temperature, repetition_penalty=input.repetition_penalty, - streaming=input.streaming, + streaming=input.stream, server_kwargs=server_kwargs, ) result = await self.generate(input, self.client) @@ -302,9 +302,9 @@ async def invoke(self, input: DocSumLLMParams): if self.access_token: headers = {"Authorization": f"Bearer {self.access_token}"} - if input.streaming and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support streaming=True, set to streaming=False") - input.streaming = False + if input.stream and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support stream=True, set to stream=False") + input.stream = False self.client = VLLMOpenAI( 
openai_api_key="EMPTY", openai_api_base=self.llm_endpoint + "/v1", @@ -312,7 +312,7 @@ async def invoke(self, input: DocSumLLMParams): default_headers=headers, max_tokens=input.max_tokens, top_p=input.top_p, - streaming=input.streaming, + streaming=input.stream, temperature=input.temperature, presence_penalty=input.repetition_penalty, ) diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh index fd6ca015c7..d15e095f7a 100644 --- a/tests/llms/test_llms_doc-summarization_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -96,7 +96,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' echo "Validate Chinese mode..." validate_services \ @@ -104,7 +104,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' echo "Validate truncate mode..." validate_services \ @@ -120,7 +120,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' echo "Validate refine mode..." validate_services \ diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh index 1256876b21..a51f6de504 100644 --- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -96,7 +96,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' echo "Validate Chinese mode..." validate_services \ @@ -104,7 +104,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' echo "Validate truncate mode..." validate_services \ @@ -120,7 +120,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' echo "Validate refine mode..." validate_services \ diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh index d87c984c25..50ad84e001 100644 --- a/tests/llms/test_llms_doc-summarization_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -108,7 +108,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' echo "Validate Chinese mode..." validate_services \ @@ -116,7 +116,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' echo "Validate truncate mode..." 
validate_services \ @@ -132,7 +132,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' echo "Validate refine mode..." validate_services \ diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh index df797adc3f..a91127c742 100644 --- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -109,7 +109,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' echo "Validate Chinese mode..." validate_services \ @@ -117,7 +117,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' echo "Validate truncate mode..." validate_services \ @@ -133,7 +133,7 @@ function validate_microservices() { 'text' \ "llm_summarization" \ "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' echo "Validate refine mode..." 
validate_services \ From 9d869c9b042de9474048aa19d9f6874e86c47ecc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:47:55 +0000 Subject: [PATCH 10/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/doc-summarization/integrations/opea.py | 2 ++ comps/llms/src/doc-summarization/opea_docsum_microservice.py | 1 + 2 files changed, 3 insertions(+) diff --git a/comps/llms/src/doc-summarization/integrations/opea.py b/comps/llms/src/doc-summarization/integrations/opea.py index 5276251eba..3decf1d45e 100644 --- a/comps/llms/src/doc-summarization/integrations/opea.py +++ b/comps/llms/src/doc-summarization/integrations/opea.py @@ -205,6 +205,7 @@ async def stream_generator(): return GeneratedDoc(text=output_text, prompt=input.query) + @OpeaComponentRegistry.register("OPEADocSum_TGI") class OPEADocSum_TGI(OPEADocSum): """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. @@ -266,6 +267,7 @@ async def invoke(self, input: DocSumLLMParams): return result + @OpeaComponentRegistry.register("OPEADocSum_vLLM") class OPEADocSum_vLLM(OPEADocSum): """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py b/comps/llms/src/doc-summarization/opea_docsum_microservice.py index 22bca39c2b..87861559f5 100644 --- a/comps/llms/src/doc-summarization/opea_docsum_microservice.py +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -22,6 +22,7 @@ # Initialize OpeaComponentLoader loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM DocSum Component: {llm_component_name}") + @register_microservice( name="opea_service@llm_docsum", service_type=ServiceType.LLM, From d95e8f39f1cbb76e015fbbf11021c894bbf5cf3a Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Fri, 10 Jan 2025 13:33:31 +0800 Subject: [PATCH 11/23] fix bug Signed-off-by: Xinyao Wang --- comps/llms/src/doc-summarization/opea_docsum_microservice.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py b/comps/llms/src/doc-summarization/opea_docsum_microservice.py index 87861559f5..2db2f92db8 100644 --- a/comps/llms/src/doc-summarization/opea_docsum_microservice.py +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -4,6 +4,8 @@ import os import time +from integrations.opea import OPEADocSum_TGI, OPEADocSum_vLLM + from comps import ( CustomLogger, DocSumLLMParams, From cd6d2b2574494a28176db0afd924f8e972dd4cc6 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Fri, 10 Jan 2025 13:38:16 +0800 Subject: [PATCH 12/23] rename Signed-off-by: Xinyao Wang --- .../doc-summarization/integrations/{opea.py => langchain.py} | 0 comps/llms/src/doc-summarization/opea_docsum_microservice.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename comps/llms/src/doc-summarization/integrations/{opea.py => langchain.py} (100%) diff --git a/comps/llms/src/doc-summarization/integrations/opea.py b/comps/llms/src/doc-summarization/integrations/langchain.py similarity index 100% rename from comps/llms/src/doc-summarization/integrations/opea.py rename to comps/llms/src/doc-summarization/integrations/langchain.py diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py 
b/comps/llms/src/doc-summarization/opea_docsum_microservice.py index 2db2f92db8..b7519cacf7 100644 --- a/comps/llms/src/doc-summarization/opea_docsum_microservice.py +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -4,7 +4,7 @@ import os import time -from integrations.opea import OPEADocSum_TGI, OPEADocSum_vLLM +from integrations.langchain import OPEADocSum_TGI, OPEADocSum_vLLM from comps import ( CustomLogger, From fb914f03ced9674e60cf289ad1ed3170ce74b2ba Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Fri, 10 Jan 2025 16:24:21 +0800 Subject: [PATCH 13/23] fix bug Signed-off-by: Xinyao Wang --- comps/llms/src/doc-summarization/README.md | 6 +++--- ..._tgi.sh => test_llms_doc-summarization_langchain_tgi.sh} | 0 ...st_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh} | 0 ...llm.sh => test_llms_doc-summarization_langchain_vllm.sh} | 0 ...t_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh} | 0 5 files changed, 3 insertions(+), 3 deletions(-) rename tests/llms/{test_llms_doc-summarization_tgi.sh => test_llms_doc-summarization_langchain_tgi.sh} (100%) rename tests/llms/{test_llms_doc-summarization_tgi_on_intel_hpu.sh => test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh} (100%) rename tests/llms/{test_llms_doc-summarization_vllm.sh => test_llms_doc-summarization_langchain_vllm.sh} (100%) rename tests/llms/{test_llms_doc-summarization_vllm_on_intel_hpu.sh => test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh} (100%) diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md index a18e1a41d8..5b5cc1c598 100644 --- a/comps/llms/src/doc-summarization/README.md +++ b/comps/llms/src/doc-summarization/README.md @@ -1,6 +1,6 @@ # Document Summary LLM Microservice -This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors. You can set backend service either [TGI](../../../3rd_parties/tgi) or [vLLM](../../../3rd_parties/vllm). +This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors. You can set backend service either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm). ## 🚀1. Start Microservice with Docker 🐳 @@ -26,7 +26,7 @@ Please make sure MAX_TOTAL_TOKENS should be larger than (MAX_INPUT_TOKENS + max_ Step 1: Prepare backend LLM docker image. -If you want to use vLLM backend, refer to [vLLM](../../../3rd_parties/vllm/src) to build vLLM docker images first. +If you want to use vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build vLLM docker images first. No need for TGI. @@ -49,7 +49,7 @@ You can choose one as needed. ### 1.3.1 Run Docker with CLI (Option A) Step 1: Start the backend LLM service -Please refer to [TGI](../../../3rd_parties/tgi/deployment/docker_compose/) or [vLLM](<(../../../3rd_parties/vllm/deployment/docker_compose/)>) guideline to start a backend LLM service. +Please refer to [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](<(../../../third_parties/vllm/deployment/docker_compose/)>) guideline to start a backend LLM service. 
Step 2: Start the DocSum microservices diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh similarity index 100% rename from tests/llms/test_llms_doc-summarization_tgi.sh rename to tests/llms/test_llms_doc-summarization_langchain_tgi.sh diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh similarity index 100% rename from tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh rename to tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm.sh similarity index 100% rename from tests/llms/test_llms_doc-summarization_vllm.sh rename to tests/llms/test_llms_doc-summarization_langchain_vllm.sh diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh similarity index 100% rename from tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh rename to tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh From 6b829f52fa545010ca277492cbb354cebab49502 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Fri, 10 Jan 2025 21:19:38 +0800 Subject: [PATCH 14/23] fix port bug Signed-off-by: Xinyao Wang --- tests/llms/test_llms_doc-summarization_langchain_tgi.sh | 1 - .../test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh | 1 - tests/llms/test_llms_doc-summarization_langchain_vllm.sh | 1 - .../test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh | 1 - 4 files changed, 4 deletions(-) diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh index d15e095f7a..155e545854 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh @@ -71,7 +71,6 @@ function validate_services() { } function validate_microservices() { - DOCSUM_PORT=5076 URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" echo "Validate tgi..." diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh index a51f6de504..ca1884fa2e 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh @@ -71,7 +71,6 @@ function validate_services() { } function validate_microservices() { - DOCSUM_PORT=5076 URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" echo "Validate tgi..." diff --git a/tests/llms/test_llms_doc-summarization_langchain_vllm.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm.sh index 50ad84e001..ae414c1507 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_vllm.sh @@ -83,7 +83,6 @@ function validate_services() { } function validate_microservices() { - DOCSUM_PORT=5076 URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" echo "Validate vllm..." 
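Editor's note: the test hunks above exercise the microservice through plain HTTP POSTs, where the `stream` flag (renamed from `streaming` earlier in this series) switches between a streamed response and a single `GeneratedDoc` JSON object with a `text` field. Below is a rough client-side equivalent of one non-streaming validation call; the host, port, and route are assumptions taken from the scripts, and at this point in the series the route is still `/v1/chat/docsum` (a later patch renames it to `/v1/docsum`).

```python
# Rough Python equivalent of a non-streaming validation request from the test scripts.
import os

import requests

host_ip = os.getenv("host_ip", "localhost")
docsum_port = os.getenv("DOCSUM_PORT", "9000")
# Route as used above; a later patch in this series renames it to /v1/docsum.
url = f"http://{host_ip}:{docsum_port}/v1/chat/docsum"

payload = {
    "query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving "
    "open source text embeddings and sequence classification models.",
    "max_tokens": 32,
    "language": "en",
    "summary_type": "truncate",
    "chunk_size": 2000,
    "stream": False,
}

resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
# With stream=False the service returns a GeneratedDoc, so a "text" field should be present.
print(resp.json()["text"])
```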
diff --git a/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh index a91127c742..4167593da6 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh @@ -84,7 +84,6 @@ function validate_services() { } function validate_microservices() { - DOCSUM_PORT=5076 URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" echo "Validate vllm..." From c30df658f679e7e1fa7d6c9d69c889a7a8b7f0ef Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Fri, 10 Jan 2025 21:30:45 +0800 Subject: [PATCH 15/23] algin docsum endpoint Signed-off-by: Xinyao Wang --- comps/llms/src/doc-summarization/README.md | 12 ++++++------ .../test_llms_doc-summarization_langchain_tgi.sh | 2 +- ...s_doc-summarization_langchain_tgi_on_intel_hpu.sh | 2 +- .../test_llms_doc-summarization_langchain_vllm.sh | 2 +- ..._doc-summarization_langchain_vllm_on_intel_hpu.sh | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md index 5b5cc1c598..1d3b210318 100644 --- a/comps/llms/src/doc-summarization/README.md +++ b/comps/llms/src/doc-summarization/README.md @@ -113,19 +113,19 @@ If you want to deal with long context, can select suitable summary type, details ```bash # Enable stream to receive a stream response. By default, this is set to True. -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \ -H 'Content-Type: application/json' # Disable stream to receive a non-stream response. -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \ -H 'Content-Type: application/json' # Use Chinese mode -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \ -H 'Content-Type: application/json' @@ -146,7 +146,7 @@ In this mode LLM generate summary based on complete input text. In this case ple Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` ```bash -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \ -H 'Content-Type: application/json' @@ -159,7 +159,7 @@ Map_reduce mode will split the inputs into multiple chunks, map each document to In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` ```bash -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \ -H 'Content-Type: application/json' @@ -172,7 +172,7 @@ Refin mode will split the inputs into multiple chunks, generate summary for the In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`. ```bash -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \ -H 'Content-Type: application/json' diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh index 155e545854..bbf9cd989a 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh @@ -71,7 +71,7 @@ function validate_services() { } function validate_microservices() { - URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" echo "Validate tgi..." validate_services \ diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh index ca1884fa2e..df67f70460 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh @@ -71,7 +71,7 @@ function validate_services() { } function validate_microservices() { - URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" echo "Validate tgi..." validate_services \ diff --git a/tests/llms/test_llms_doc-summarization_langchain_vllm.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm.sh index ae414c1507..b32060505a 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_vllm.sh @@ -83,7 +83,7 @@ function validate_services() { } function validate_microservices() { - URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" echo "Validate vllm..." 
validate_services \ diff --git a/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh index 4167593da6..ad9d72d0f0 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh @@ -84,7 +84,7 @@ function validate_services() { } function validate_microservices() { - URL="http://${host_ip}:$DOCSUM_PORT/v1/chat/docsum" + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" echo "Validate vllm..." validate_services \ From be99d46ce8e9828ca0bf775481cc9be31eddd655 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Sat, 11 Jan 2025 15:13:15 +0800 Subject: [PATCH 16/23] rm vllm ut, too slow Signed-off-by: Xinyao Wang --- ...t_llms_doc-summarization_langchain_vllm.sh | 164 ------------------ 1 file changed, 164 deletions(-) delete mode 100644 tests/llms/test_llms_doc-summarization_langchain_vllm.sh diff --git a/tests/llms/test_llms_doc-summarization_langchain_vllm.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm.sh deleted file mode 100644 index b32060505a..0000000000 --- a/tests/llms/test_llms_doc-summarization_langchain_vllm.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe - -WORKPATH=$(dirname "$PWD") -host_ip=$(hostname -I | awk '{print $1}') -LOG_PATH="$WORKPATH/tests" - -function build_docker_images() { - cd $WORKPATH - git clone https://github.com/vllm-project/vllm.git - cd ./vllm/ - docker build -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy . - if [ $? -ne 0 ]; then - echo "opea/vllm built fail" - exit 1 - else - echo "opea/vllm built successful" - fi - - cd $WORKPATH - docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/llm-docsum built fail" - exit 1 - else - echo "opea/llm-docsum built successful" - fi -} - -function start_service() { - export host_ip=${host_ip} - export LLM_ENDPOINT_PORT=5074 - export DOCSUM_PORT=5075 - export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} - export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" - export MAX_INPUT_TOKENS=2048 - export MAX_TOTAL_TOKENS=4096 - export DocSum_COMPONENT_NAME="OPEADocSum_vLLM" # or "vllm" - export VLLM_SKIP_WARMUP=true - export LOGFLAG=True - - cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f doc-summarization_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - - sleep 30s -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - echo $CONTENT - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." 
- else - echo "[ $SERVICE_NAME ] Content does not match the expected result" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_microservices() { - URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" - - echo "Validate vllm..." - validate_services \ - "${LLM_ENDPOINT}/v1/completions" \ - "text" \ - "vllm" \ - "vllm-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' - - echo "Validate stream=True..." - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "llm-docsum-server" \ - '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' - - echo "Validate stream=False..." - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' - - echo "Validate Chinese mode..." - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "llm-docsum-server" \ - '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' - - echo "Validate truncate mode..." - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' - - echo "Validate map_reduce mode..." - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' - - echo "Validate refine mode..." - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "llm-docsum-server" \ - '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' -} - -function stop_docker() { - cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f doc-summarization_vllm.yaml down -} - -function main() { - - stop_docker - - build_docker_images - start_service - - validate_microservices - - stop_docker - echo y | docker system prune - -} - -main From 8fa84047b8d922c6e04798fbed2160fb280e49bc Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Sat, 11 Jan 2025 15:46:47 +0800 Subject: [PATCH 17/23] refine code Signed-off-by: Xinyao Wang --- .../integrations/langchain.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/comps/llms/src/doc-summarization/integrations/langchain.py b/comps/llms/src/doc-summarization/integrations/langchain.py index 3decf1d45e..dcf55ff67a 100644 --- a/comps/llms/src/doc-summarization/integrations/langchain.py +++ b/comps/llms/src/doc-summarization/integrations/langchain.py @@ -38,24 +38,22 @@ else: DEFAULT_ENDPOINT = "http://localhost:8080" -# Validate and Load the models config if MODEL_CONFIGS is not null -configs_map = {} -if MODEL_CONFIGS: - try: - configs_map = load_model_configs(MODEL_CONFIGS) - except ConfigError as e: - logger.error(f"Failed to load model configurations: {e}") - raise ConfigError(f"Failed to load model configurations: {e}") - - def get_llm_endpoint(): if not MODEL_CONFIGS: return DEFAULT_ENDPOINT - try: - return configs_map.get(MODEL_NAME).get("endpoint") - except ConfigError as e: - logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}") - raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs") + else: + # Validate and Load the models config if MODEL_CONFIGS is not null + configs_map = {} + try: + configs_map = load_model_configs(MODEL_CONFIGS) + except ConfigError as e: + logger.error(f"Failed to load model configurations: {e}") + raise ConfigError(f"Failed to load model configurations: {e}") + try: + return configs_map.get(MODEL_NAME).get("endpoint") + except ConfigError as e: + logger.error(f"Input model {MODEL_NAME} not present in model_configs. 
Error {e}") + raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs") class OPEADocSum(OpeaComponent): From 18e2ee0ce61194a5378ac751c75beeb4d69a5650 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 11 Jan 2025 08:35:31 +0000 Subject: [PATCH 18/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/doc-summarization/integrations/langchain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comps/llms/src/doc-summarization/integrations/langchain.py b/comps/llms/src/doc-summarization/integrations/langchain.py index dcf55ff67a..3bc4df6341 100644 --- a/comps/llms/src/doc-summarization/integrations/langchain.py +++ b/comps/llms/src/doc-summarization/integrations/langchain.py @@ -38,6 +38,7 @@ else: DEFAULT_ENDPOINT = "http://localhost:8080" + def get_llm_endpoint(): if not MODEL_CONFIGS: return DEFAULT_ENDPOINT From e548779f0bda10cb0a258f3f50739f2c51d25fd3 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 13 Jan 2025 09:49:52 +0800 Subject: [PATCH 19/23] rename Signed-off-by: Xinyao Wang --- .../integrations/{langchain.py => common.py} | 122 +----------------- .../src/doc-summarization/integrations/tgi.py | 74 +++++++++++ .../doc-summarization/integrations/vllm.py | 66 ++++++++++ .../opea_docsum_microservice.py | 3 +- 4 files changed, 144 insertions(+), 121 deletions(-) rename comps/llms/src/doc-summarization/integrations/{langchain.py => common.py} (67%) create mode 100644 comps/llms/src/doc-summarization/integrations/tgi.py create mode 100644 comps/llms/src/doc-summarization/integrations/vllm.py diff --git a/comps/llms/src/doc-summarization/integrations/langchain.py b/comps/llms/src/doc-summarization/integrations/common.py similarity index 67% rename from comps/llms/src/doc-summarization/integrations/langchain.py rename to comps/llms/src/doc-summarization/integrations/common.py index dcf55ff67a..4ab0ec5612 100644 --- a/comps/llms/src/doc-summarization/integrations/langchain.py +++ b/comps/llms/src/doc-summarization/integrations/common.py @@ -8,11 +8,10 @@ from langchain.chains.summarize import load_summarize_chain from langchain.docstore.document import Document from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter -from langchain_community.llms import HuggingFaceEndpoint, VLLMOpenAI from langchain_core.prompts import PromptTemplate from transformers import AutoTokenizer -from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh @@ -201,121 +200,4 @@ async def stream_generator(): logger.info("\n\noutput_text:") logger.info(output_text) - return GeneratedDoc(text=output_text, prompt=input.query) - - -@OpeaComponentRegistry.register("OPEADocSum_TGI") -class OPEADocSum_TGI(OPEADocSum): - """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. - - Attributes: - client (TGI): An instance of the TGI client for text generation. - """ - - def check_health(self) -> bool: - """Checks the health of the TGI LLM service. - - Returns: - bool: True if the service is reachable and healthy, False otherwise. 
- """ - - try: - # response = requests.get(f"{self.llm_endpoint}/health") - - # Will remove after TGI gaudi fix health bug - url = f"{self.llm_endpoint}/generate" - data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}} - headers = {"Content-Type": "application/json"} - response = requests.post(url=url, json=data, headers=headers) - - if response.status_code == 200: - return True - else: - return False - except Exception as e: - logger.error(e) - logger.error("Health check failed") - return False - - async def invoke(self, input: DocSumLLMParams): - """Invokes the TGI LLM service to generate summarization output for the provided input. - - Args: - input (DocSumLLMParams): The input text(s). - """ - server_kwargs = {} - if self.access_token: - server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"} - - if input.stream and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support stream=True, set to stream=False") - input.stream = False - self.client = HuggingFaceEndpoint( - endpoint_url=self.llm_endpoint, - max_new_tokens=input.max_tokens, - top_k=input.top_k, - top_p=input.top_p, - typical_p=input.typical_p, - temperature=input.temperature, - repetition_penalty=input.repetition_penalty, - streaming=input.stream, - server_kwargs=server_kwargs, - ) - result = await self.generate(input, self.client) - - return result - - -@OpeaComponentRegistry.register("OPEADocSum_vLLM") -class OPEADocSum_vLLM(OPEADocSum): - """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. - - Attributes: - client (vLLM): An instance of the vLLM client for text generation. - """ - - def check_health(self) -> bool: - """Checks the health of the vLLM LLM service. - - Returns: - bool: True if the service is reachable and healthy, False otherwise. - """ - - try: - response = requests.get(f"{self.llm_endpoint}/health") - if response.status_code == 200: - return True - else: - return False - except Exception as e: - logger.error(e) - logger.error("Health check failed") - return False - - async def invoke(self, input: DocSumLLMParams): - """Invokes the vLLM LLM service to generate summarization output for the provided input. - - Args: - input (DocSumLLMParams): The input text(s). 
- """ - headers = {} - if self.access_token: - headers = {"Authorization": f"Bearer {self.access_token}"} - - if input.stream and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support stream=True, set to stream=False") - input.stream = False - self.client = VLLMOpenAI( - openai_api_key="EMPTY", - openai_api_base=self.llm_endpoint + "/v1", - model_name=MODEL_NAME, - default_headers=headers, - max_tokens=input.max_tokens, - top_p=input.top_p, - streaming=input.stream, - temperature=input.temperature, - presence_penalty=input.repetition_penalty, - ) - result = await self.generate(input, self.client) - - return result + return GeneratedDoc(text=output_text, prompt=input.query) \ No newline at end of file diff --git a/comps/llms/src/doc-summarization/integrations/tgi.py b/comps/llms/src/doc-summarization/integrations/tgi.py new file mode 100644 index 0000000000..ec02a1f129 --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/tgi.py @@ -0,0 +1,74 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests +from langchain_community.llms import HuggingFaceEndpoint +from .common import * + +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +logger = CustomLogger("llm_docsum_tgi") +logflag = os.getenv("LOGFLAG", False) + +@OpeaComponentRegistry.register("OPEADocSum_TGI") +class OPEADocSum_TGI(OPEADocSum): + """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. + + Attributes: + client (TGI): An instance of the TGI client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the TGI LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + # response = requests.get(f"{self.llm_endpoint}/health") + + # Will remove after TGI gaudi fix health bug + url = f"{self.llm_endpoint}/generate" + data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}} + headers = {"Content-Type": "application/json"} + response = requests.post(url=url, json=data, headers=headers) + + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: DocSumLLMParams): + """Invokes the TGI LLM service to generate summarization output for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). 
+ """ + server_kwargs = {} + if self.access_token: + server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"} + + if input.stream and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support stream=True, set to stream=False") + input.stream = False + self.client = HuggingFaceEndpoint( + endpoint_url=self.llm_endpoint, + max_new_tokens=input.max_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.stream, + server_kwargs=server_kwargs, + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/doc-summarization/integrations/vllm.py b/comps/llms/src/doc-summarization/integrations/vllm.py new file mode 100644 index 0000000000..42e103209f --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/vllm.py @@ -0,0 +1,66 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests +from langchain_community.llms import VLLMOpenAI +from .common import * +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +logger = CustomLogger("llm_docsum_vllm") +logflag = os.getenv("LOGFLAG", False) + +@OpeaComponentRegistry.register("OPEADocSum_vLLM") +class OPEADocSum_vLLM(OPEADocSum): + """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. + + Attributes: + client (vLLM): An instance of the vLLM client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the vLLM LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + response = requests.get(f"{self.llm_endpoint}/health") + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: DocSumLLMParams): + """Invokes the vLLM LLM service to generate summarization output for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). 
+ """ + headers = {} + if self.access_token: + headers = {"Authorization": f"Bearer {self.access_token}"} + + if input.stream and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support stream=True, set to stream=False") + input.stream = False + self.client = VLLMOpenAI( + openai_api_key="EMPTY", + openai_api_base=self.llm_endpoint + "/v1", + model_name=MODEL_NAME, + default_headers=headers, + max_tokens=input.max_tokens, + top_p=input.top_p, + streaming=input.stream, + temperature=input.temperature, + presence_penalty=input.repetition_penalty, + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py b/comps/llms/src/doc-summarization/opea_docsum_microservice.py index b7519cacf7..18d9e409f4 100644 --- a/comps/llms/src/doc-summarization/opea_docsum_microservice.py +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -4,7 +4,8 @@ import os import time -from integrations.langchain import OPEADocSum_TGI, OPEADocSum_vLLM +from integrations.tgi import OPEADocSum_TGI +from integrations.vllm import OPEADocSum_vLLM from comps import ( CustomLogger, From f407abb8f319ef3586825d3e38408e032839b740 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 13 Jan 2025 10:29:47 +0800 Subject: [PATCH 20/23] fix link Signed-off-by: Xinyao Wang --- comps/llms/src/doc-summarization/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md index 1d3b210318..f32a0d7d1d 100644 --- a/comps/llms/src/doc-summarization/README.md +++ b/comps/llms/src/doc-summarization/README.md @@ -49,7 +49,7 @@ You can choose one as needed. ### 1.3.1 Run Docker with CLI (Option A) Step 1: Start the backend LLM service -Please refer to [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](<(../../../third_parties/vllm/deployment/docker_compose/)>) guideline to start a backend LLM service. +Please refer to [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guideline to start a backend LLM service. 
Step 2: Start the DocSum microservices From cf663eee1f5c48480dda2f9a3d614d637360a952 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 03:19:05 +0000 Subject: [PATCH 21/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/doc-summarization/integrations/common.py | 2 +- comps/llms/src/doc-summarization/integrations/tgi.py | 4 +++- comps/llms/src/doc-summarization/integrations/vllm.py | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/comps/llms/src/doc-summarization/integrations/common.py b/comps/llms/src/doc-summarization/integrations/common.py index f4eb6651a4..3fb0dde092 100644 --- a/comps/llms/src/doc-summarization/integrations/common.py +++ b/comps/llms/src/doc-summarization/integrations/common.py @@ -201,4 +201,4 @@ async def stream_generator(): logger.info("\n\noutput_text:") logger.info(output_text) - return GeneratedDoc(text=output_text, prompt=input.query) \ No newline at end of file + return GeneratedDoc(text=output_text, prompt=input.query) diff --git a/comps/llms/src/doc-summarization/integrations/tgi.py b/comps/llms/src/doc-summarization/integrations/tgi.py index ec02a1f129..a15c52e7d4 100644 --- a/comps/llms/src/doc-summarization/integrations/tgi.py +++ b/comps/llms/src/doc-summarization/integrations/tgi.py @@ -5,13 +5,15 @@ import requests from langchain_community.llms import HuggingFaceEndpoint -from .common import * from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType +from .common import * + logger = CustomLogger("llm_docsum_tgi") logflag = os.getenv("LOGFLAG", False) + @OpeaComponentRegistry.register("OPEADocSum_TGI") class OPEADocSum_TGI(OPEADocSum): """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. diff --git a/comps/llms/src/doc-summarization/integrations/vllm.py b/comps/llms/src/doc-summarization/integrations/vllm.py index 42e103209f..6651fbd203 100644 --- a/comps/llms/src/doc-summarization/integrations/vllm.py +++ b/comps/llms/src/doc-summarization/integrations/vllm.py @@ -5,12 +5,15 @@ import requests from langchain_community.llms import VLLMOpenAI -from .common import * + from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType +from .common import * + logger = CustomLogger("llm_docsum_vllm") logflag = os.getenv("LOGFLAG", False) + @OpeaComponentRegistry.register("OPEADocSum_vLLM") class OPEADocSum_vLLM(OPEADocSum): """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. 
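Editor's note: stepping back from the mechanical renames above, the summary modes documented earlier in this README (truncate, map_reduce, refine) each derive a default `chunk_size` from `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS`. The helper below is only a sketch of those documented formulas, not the service's internal code, with the sample token budgets used throughout the tests.

```python
# Sketch of the default chunk sizes documented in the README above; the service
# computes these internally, so treat this as an illustration of the formulas only.
MAX_INPUT_TOKENS = 2048
MAX_TOTAL_TOKENS = 4096


def default_chunk_size(summary_type: str, max_tokens: int) -> int:
    """Return the documented default chunk size for a given summary mode."""
    if summary_type in ("truncate", "map_reduce"):
        # 50 tokens are reserved for the prompt.
        return min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)
    if summary_type == "refine":
        # Refine reserves room for two generations plus 128 prompt tokens.
        return min(MAX_TOTAL_TOKENS - 2 * max_tokens - 128, MAX_INPUT_TOKENS)
    raise ValueError(f"no documented default chunk size for summary_type={summary_type!r}")


if __name__ == "__main__":
    for mode in ("truncate", "map_reduce", "refine"):
        print(mode, default_chunk_size(mode, max_tokens=32))
```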
From 33f64f2cc1bc71416ea0ab9e3bdb2dc66ea58b3e Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 13 Jan 2025 13:09:35 +0800 Subject: [PATCH 22/23] rename for ut Signed-off-by: Xinyao Wang --- .../docker_compose/doc-summarization_tgi_on_intel_hpu.yaml | 2 +- .../llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml index 1424884439..c8562a2c56 100644 --- a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml @@ -6,7 +6,7 @@ version: "3.8" services: tgi-service: image: ghcr.io/huggingface/tgi-gaudi:2.3.1 - container_name: tgi-gaudi-server + container_name: tgi_gaudi_server ports: - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: diff --git a/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh index 7a1a4fc698..a29113ce02 100644 --- a/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh @@ -72,7 +72,7 @@ function validate_backend_microservices() { "${host_ip}:${LLM_ENDPOINT_PORT}/generate" \ "generated_text" \ "tgi" \ - "tgi-gaudi-server" \ + "tgi_gaudi_server" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' # faq From fe7f360e01ce3cf304304e59bfd7f44074e5df60 Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 13 Jan 2025 13:45:51 +0800 Subject: [PATCH 23/23] fix bug Signed-off-by: Xinyao Wang --- .../test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh | 2 +- .../llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh index df67f70460..ebfd5d8f2a 100644 --- a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh @@ -78,7 +78,7 @@ function validate_microservices() { "${LLM_ENDPOINT}/generate" \ "generated_text" \ "tgi" \ - "tgi-gaudi-server" \ + "tgi_gaudi_server" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' echo "Validate stream=True..." diff --git a/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh index a29113ce02..7a1a4fc698 100644 --- a/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh @@ -72,7 +72,7 @@ function validate_backend_microservices() { "${host_ip}:${LLM_ENDPOINT_PORT}/generate" \ "generated_text" \ "tgi" \ - "tgi_gaudi_server" \ + "tgi-gaudi-server" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' # faq
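Editor's note: since much of this series revolves around the `streaming` to `stream` rename, here is a rough client-side sketch of consuming a streamed response. The host, port, and route are assumptions taken from the final test scripts, and the exact event framing is whatever the service's `stream_generator` emits, so the loop simply prints each line as it arrives; note that map_reduce mode forces `stream=False` server-side.

```python
# Rough sketch of consuming a streamed DocSum response; host/port/route are
# assumptions from the test scripts, and lines are printed as-is because the
# event framing is defined by the service's stream_generator.
import os

import requests

host_ip = os.getenv("host_ip", "localhost")
docsum_port = os.getenv("DOCSUM_PORT", "9000")
url = f"http://{host_ip}:{docsum_port}/v1/docsum"

payload = {
    "query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving "
    "open source text embeddings models.",
    "max_tokens": 32,
    "language": "en",
    "stream": True,  # map_reduce does not support streaming, so use a streamable mode
}

with requests.post(url, json=payload, stream=True, timeout=120) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))
```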