From ea72c943bd039231ffd18850fee16274be7c6c4f Mon Sep 17 00:00:00 2001 From: XinyaoWa Date: Mon, 13 Jan 2025 11:30:59 +0800 Subject: [PATCH] Refactor FaqGen (#1093) Signed-off-by: Xinyao Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../docker/compose/llms-compose.yaml | 10 +- .../docker_compose/faq-generation_tgi.yaml | 50 +++++++ .../faq-generation_tgi_on_intel_hpu.yaml | 61 +++++++++ .../docker_compose/faq-generation_vllm.yaml | 53 ++++++++ .../faq-generation_vllm_on_intel_hpu.yaml} | 28 ++-- .../faq-generation/tgi/langchain/README.md | 75 ---------- .../tgi/langchain/docker_compose_llm.yaml | 34 ----- .../tgi/langchain/entrypoint.sh | 8 -- .../llms/faq-generation/tgi/langchain/llm.py | 100 -------------- .../faq-generation/vllm/langchain/Dockerfile | 25 ---- .../faq-generation/vllm/langchain/README.md | 77 ----------- .../faq-generation/vllm/langchain/__init__.py | 2 - .../llms/faq-generation/vllm/langchain/llm.py | 102 -------------- .../vllm/langchain/requirements-runtime.txt | 1 - .../vllm/langchain/requirements.txt | 15 -- .../faq-generation}/Dockerfile | 4 +- comps/llms/src/faq-generation/README.md | 110 +++++++++++++++ .../faq-generation}/entrypoint.sh | 2 +- .../faq-generation/integrations}/__init__.py | 0 .../src/faq-generation/integrations/common.py | 110 +++++++++++++++ .../src/faq-generation/integrations/tgi.py | 73 ++++++++++ .../src/faq-generation/integrations/vllm.py | 65 +++++++++ .../opea_faqgen_microservice.py | 58 ++++++++ .../faq-generation}/requirements-runtime.txt | 0 .../faq-generation}/requirements.txt | 0 .../build_docker_vllm.sh | 0 .../build_docker_vllm_openvino.sh | 0 .../test_llms_faq-generation_langchain_tgi.sh | 114 ++++++++++++++++ ...q-generation_langchain_tgi_on_intel_hpu.sh | 114 ++++++++++++++++ ...-generation_langchain_vllm_on_intel_hpu.sh | 128 ++++++++++++++++++ .../test_llms_faq-generation_tgi_langchain.sh | 94 ------------- 31 files changed, 962 insertions(+), 551 deletions(-) create mode 100644 comps/llms/deployment/docker_compose/faq-generation_tgi.yaml create mode 100644 comps/llms/deployment/docker_compose/faq-generation_tgi_on_intel_hpu.yaml create mode 100644 comps/llms/deployment/docker_compose/faq-generation_vllm.yaml rename comps/llms/{faq-generation/vllm/langchain/docker_compose_llm.yaml => deployment/docker_compose/faq-generation_vllm_on_intel_hpu.yaml} (52%) delete mode 100644 comps/llms/faq-generation/tgi/langchain/README.md delete mode 100644 comps/llms/faq-generation/tgi/langchain/docker_compose_llm.yaml delete mode 100644 comps/llms/faq-generation/tgi/langchain/entrypoint.sh delete mode 100644 comps/llms/faq-generation/tgi/langchain/llm.py delete mode 100644 comps/llms/faq-generation/vllm/langchain/Dockerfile delete mode 100644 comps/llms/faq-generation/vllm/langchain/README.md delete mode 100644 comps/llms/faq-generation/vllm/langchain/__init__.py delete mode 100644 comps/llms/faq-generation/vllm/langchain/llm.py delete mode 100644 comps/llms/faq-generation/vllm/langchain/requirements-runtime.txt delete mode 100644 comps/llms/faq-generation/vllm/langchain/requirements.txt rename comps/llms/{faq-generation/tgi/langchain => src/faq-generation}/Dockerfile (75%) create mode 100644 comps/llms/src/faq-generation/README.md rename comps/llms/{faq-generation/vllm/langchain => src/faq-generation}/entrypoint.sh (81%) rename comps/llms/{faq-generation/tgi/langchain => src/faq-generation/integrations}/__init__.py (100%) create mode 100644 comps/llms/src/faq-generation/integrations/common.py 
create mode 100644 comps/llms/src/faq-generation/integrations/tgi.py create mode 100644 comps/llms/src/faq-generation/integrations/vllm.py create mode 100644 comps/llms/src/faq-generation/opea_faqgen_microservice.py rename comps/llms/{faq-generation/tgi/langchain => src/faq-generation}/requirements-runtime.txt (100%) rename comps/llms/{faq-generation/tgi/langchain => src/faq-generation}/requirements.txt (100%) rename comps/third_parties/vllm/{deployment/docker_compose => src}/build_docker_vllm.sh (100%) rename comps/third_parties/vllm/{deployment/docker_compose => src}/build_docker_vllm_openvino.sh (100%) create mode 100644 tests/llms/test_llms_faq-generation_langchain_tgi.sh create mode 100644 tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh create mode 100644 tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh delete mode 100644 tests/llms/test_llms_faq-generation_tgi_langchain.sh diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml index a86c035af7..864d74bd80 100644 --- a/.github/workflows/docker/compose/llms-compose.yaml +++ b/.github/workflows/docker/compose/llms-compose.yaml @@ -15,10 +15,10 @@ services: build: dockerfile: comps/llms/summarization/tgi/langchain/Dockerfile image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest} - llm-faqgen-tgi: + llm-faqgen: build: - dockerfile: comps/llms/faq-generation/tgi/langchain/Dockerfile - image: ${REGISTRY:-opea}/llm-faqgen-tgi:${TAG:-latest} + dockerfile: comps/llms/src/faq-generation/Dockerfile + image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest} llm-native: build: dockerfile: comps/llms/text-generation/native/langchain/Dockerfile @@ -54,7 +54,3 @@ services: build: dockerfile: comps/llms/summarization/vllm/langchain/Dockerfile image: ${REGISTRY:-opea}/llm-docsum-vllm:${TAG:-latest} - llm-faqgen-vllm: - build: - dockerfile: comps/llms/faq-generation/vllm/langchain/Dockerfile - image: ${REGISTRY:-opea}/llm-faqgen-vllm:${TAG:-latest} diff --git a/comps/llms/deployment/docker_compose/faq-generation_tgi.yaml b/comps/llms/deployment/docker_compose/faq-generation_tgi.yaml new file mode 100644 index 0000000000..8b56031dfb --- /dev/null +++ b/comps/llms/deployment/docker_compose/faq-generation_tgi.yaml @@ -0,0 +1,50 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi-service: + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + container_name: tgi-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + llm: + image: opea/llm-faqgen:latest + container_name: llm-faqgen-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - ${FAQ_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} + restart: unless-stopped + +networks: + default: + driver: 
bridge diff --git a/comps/llms/deployment/docker_compose/faq-generation_tgi_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/faq-generation_tgi_on_intel_hpu.yaml new file mode 100644 index 0000000000..1ce0ba80a1 --- /dev/null +++ b/comps/llms/deployment/docker_compose/faq-generation_tgi_on_intel_hpu.yaml @@ -0,0 +1,61 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + container_name: tgi-gaudi-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + runtime: habana + cap_add: + - SYS_NICE + ipc: host + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 + llm: + image: opea/llm-faqgen:latest + container_name: llm-faqgen-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - ${FAQ_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/deployment/docker_compose/faq-generation_vllm.yaml b/comps/llms/deployment/docker_compose/faq-generation_vllm.yaml new file mode 100644 index 0000000000..7ae89c0fb8 --- /dev/null +++ b/comps/llms/deployment/docker_compose/faq-generation_vllm.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + vllm-service: + image: opea/vllm:latest + container_name: vllm-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + shm_size: 128g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false} + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 + llm: + image: opea/llm-faqgen:latest + container_name: llm-faqgen-server + depends_on: + vllm-service: + condition: service_healthy + ports: + - ${FAQ_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git 
a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/deployment/docker_compose/faq-generation_vllm_on_intel_hpu.yaml similarity index 52% rename from comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml rename to comps/llms/deployment/docker_compose/faq-generation_vllm_on_intel_hpu.yaml index 8b26dd751d..fc5b6c9d89 100644 --- a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/deployment/docker_compose/faq-generation_vllm_on_intel_hpu.yaml @@ -8,37 +8,49 @@ services: image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - - "8008:80" + - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: - "./data:/data" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HF_TOKEN} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false} runtime: habana cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 llm: - image: opea/llm-faqgen-vllm:latest + image: opea/llm-faqgen:latest container_name: llm-faqgen-server depends_on: - - vllm-service + vllm-service: + condition: service_healthy ports: - - "9000:9000" + - ${FAQ_PORT:-9000}:9000 ipc: host environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - vLLM_ENDPOINT: ${vLLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped networks: diff --git a/comps/llms/faq-generation/tgi/langchain/README.md b/comps/llms/faq-generation/tgi/langchain/README.md deleted file mode 100644 index 8a497e091f..0000000000 --- a/comps/llms/faq-generation/tgi/langchain/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# TGI FAQGen LLM Microservice - -This microservice interacts with the TGI LLM server to generate FAQs from Input Text.[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more. - -## 🚀1. Start Microservice with Docker - -If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI service with docker. - -### 1.1 Setup Environment Variables - -In order to start TGI and LLM services, you need to setup the following environment variables first. - -```bash -export HF_TOKEN=${your_hf_api_token} -export TGI_LLM_ENDPOINT="http://${your_ip}:8008" -export LLM_MODEL_ID=${your_hf_llm_model} -``` - -### 1.2 Build Docker Image - -```bash -cd ../../../../../ -docker build -t opea/llm-faqgen-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/langchain/Dockerfile . 
-``` - -To start a docker container, you have two options: - -- A. Run Docker with CLI -- B. Run Docker with Docker Compose - -You can choose one as needed. - -### 1.3 Run Docker with CLI (Option A) - -```bash -docker run -d -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID} -``` - -```bash -docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:latest -``` - -### 1.4 Run Docker with Docker Compose (Option B) - -```bash -docker compose -f docker_compose_llm.yaml up -d -``` - -## 🚀3. Consume LLM Service - -### 3.1 Check Service Status - -```bash -curl http://${your_ip}:9000/v1/health_check\ - -X GET \ - -H 'Content-Type: application/json' -``` - -### 3.2 Consume FAQGen LLM Service - -```bash -# Streaming Response -# Set stream to True. Default will be True. -curl http://${your_ip}:9000/v1/faqgen \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ - -H 'Content-Type: application/json' - -# Non-Streaming Response -# Set stream to False. -curl http://${your_ip}:9000/v1/faqgen \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \ - -H 'Content-Type: application/json' -``` diff --git a/comps/llms/faq-generation/tgi/langchain/docker_compose_llm.yaml b/comps/llms/faq-generation/tgi/langchain/docker_compose_llm.yaml deleted file mode 100644 index afe0c0f7c3..0000000000 --- a/comps/llms/faq-generation/tgi/langchain/docker_compose_llm.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -version: "3.8" - -services: - tgi_service: - image: ghcr.io/huggingface/text-generation-inference:1.4 - container_name: tgi-service - ports: - - "8008:80" - volumes: - - "./data:/data" - environment: - HF_TOKEN: ${HF_TOKEN} - shm_size: 1g - command: --model-id ${LLM_MODEL_ID} - llm: - image: opea/llm-faqgen-tgi:latest - container_name: llm-faqgen-server - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/llms/faq-generation/tgi/langchain/entrypoint.sh b/comps/llms/faq-generation/tgi/langchain/entrypoint.sh deleted file mode 100644 index d60eddd36b..0000000000 --- a/comps/llms/faq-generation/tgi/langchain/entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -pip --no-cache-dir install -r requirements-runtime.txt - -python llm.py diff --git a/comps/llms/faq-generation/tgi/langchain/llm.py b/comps/llms/faq-generation/tgi/langchain/llm.py deleted file mode 100644 index 62456d42ea..0000000000 --- a/comps/llms/faq-generation/tgi/langchain/llm.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (C) 2024 Intel Corporation 
-# SPDX-License-Identifier: Apache-2.0 - -import os - -from fastapi.responses import StreamingResponse -from langchain.chains.summarize import load_summarize_chain -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain.text_splitter import CharacterTextSplitter -from langchain_community.llms import HuggingFaceEndpoint - -from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice -from comps.cores.mega.utils import get_access_token - -logger = CustomLogger("llm_faqgen") -logflag = os.getenv("LOGFLAG", False) - -# Environment variables -TOKEN_URL = os.getenv("TOKEN_URL") -CLIENTID = os.getenv("CLIENTID") -CLIENT_SECRET = os.getenv("CLIENT_SECRET") - - -def post_process_text(text: str): - if text == " ": - return "data: @#$\n\n" - if text == "\n": - return "data:
\n\n" - if text.isspace(): - return None - new_text = text.replace(" ", "@#$") - return f"data: {new_text}\n\n" - - -@register_microservice( - name="opea_service@llm_faqgen", - service_type=ServiceType.LLM, - endpoint="/v1/faqgen", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: LLMParamsDoc): - if logflag: - logger.info(input) - access_token = ( - get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None - ) - server_kwargs = {} - if access_token: - server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"} - llm = HuggingFaceEndpoint( - endpoint_url=llm_endpoint, - max_new_tokens=input.max_tokens, - top_k=input.top_k, - top_p=input.top_p, - typical_p=input.typical_p, - temperature=input.temperature, - repetition_penalty=input.repetition_penalty, - streaming=input.stream, - server_kwargs=server_kwargs, - ) - templ = """Create a concise FAQs (frequently asked questions and answers) for following text: - TEXT: {text} - Do not use any prefix or suffix to the FAQ. - """ - PROMPT = PromptTemplate.from_template(templ) - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - texts = text_splitter.split_text(input.query) - - # Create multiple documents - docs = [Document(page_content=t) for t in texts] - - if input.stream: - - async def stream_generator(): - from langserve.serialization import WellKnownLCSerializer - - _serializer = WellKnownLCSerializer() - async for chunk in llm_chain.astream_log(docs): - data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") - if logflag: - logger.info(data) - yield f"data: {data}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = await llm_chain.ainvoke(docs) - response = response["output_text"] - if logflag: - logger.info(response) - return GeneratedDoc(text=response, prompt=input.query) - - -if __name__ == "__main__": - llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") - # Split text - text_splitter = CharacterTextSplitter() - opea_microservices["opea_service@llm_faqgen"].start() diff --git a/comps/llms/faq-generation/vllm/langchain/Dockerfile b/comps/llms/faq-generation/vllm/langchain/Dockerfile deleted file mode 100644 index 793d5a7311..0000000000 --- a/comps/llms/faq-generation/vllm/langchain/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/vllm/langchain/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/llms/faq-generation/vllm/langchain - -ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/faq-generation/vllm/langchain/README.md b/comps/llms/faq-generation/vllm/langchain/README.md deleted file mode 100644 index 07fa7b4317..0000000000 --- a/comps/llms/faq-generation/vllm/langchain/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# vLLM FAQGen LLM Microservice - -This microservice interacts with the vLLM server to generate FAQs from Input Text.[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and 
serving, it delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention, Continuous batching and etc.. Besides GPUs, vLLM already supported [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). - -## 🚀1. Start Microservice with Docker - -If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a VLLM service with docker. - -To setup or build the vLLM image follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi) - -### 1.1 Setup Environment Variables - -In order to start vLLM and LLM services, you need to setup the following environment variables first. - -```bash -export HF_TOKEN=${your_hf_api_token} -export vLLM_ENDPOINT="http://${your_ip}:8008" -export LLM_MODEL_ID=${your_hf_llm_model} -``` - -### 1.3 Build Docker Image - -```bash -cd ../../../../../ -docker build -t opea/llm-faqgen-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/vllm/langchain/Dockerfile . -``` - -To start a docker container, you have two options: - -- A. Run Docker with CLI -- B. Run Docker with Docker Compose - -You can choose one as needed. - -### 1.3 Run Docker with CLI (Option A) - -```bash -docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} -``` - -```bash -docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-vllm:latest -``` - -### 1.4 Run Docker with Docker Compose (Option B) - -```bash -docker compose -f docker_compose_llm.yaml up -d -``` - -## 🚀3. Consume LLM Service - -### 3.1 Check Service Status - -```bash -curl http://${your_ip}:9000/v1/health_check\ - -X GET \ - -H 'Content-Type: application/json' -``` - -### 3.2 Consume FAQGen LLM Service - -```bash -# Streaming Response -# Set stream to True. Default will be True. -curl http://${your_ip}:9000/v1/faqgen \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ - -H 'Content-Type: application/json' - -# Non-Streaming Response -# Set stream to False. -curl http://${your_ip}:9000/v1/faqgen \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \ - -H 'Content-Type: application/json' -``` diff --git a/comps/llms/faq-generation/vllm/langchain/__init__.py b/comps/llms/faq-generation/vllm/langchain/__init__.py deleted file mode 100644 index 916f3a44b2..0000000000 --- a/comps/llms/faq-generation/vllm/langchain/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/faq-generation/vllm/langchain/llm.py b/comps/llms/faq-generation/vllm/langchain/llm.py deleted file mode 100644 index 4a0ffab76c..0000000000 --- a/comps/llms/faq-generation/vllm/langchain/llm.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from fastapi.responses import StreamingResponse -from langchain.chains.summarize import load_summarize_chain -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain.text_splitter import CharacterTextSplitter -from langchain_community.llms import VLLMOpenAI - -from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice -from comps.cores.mega.utils import get_access_token - -logger = CustomLogger("llm_faqgen") -logflag = os.getenv("LOGFLAG", False) - -# Environment variables -TOKEN_URL = os.getenv("TOKEN_URL") -CLIENTID = os.getenv("CLIENTID") -CLIENT_SECRET = os.getenv("CLIENT_SECRET") - - -def post_process_text(text: str): - if text == " ": - return "data: @#$\n\n" - if text == "\n": - return "data:
\n\n" - if text.isspace(): - return None - new_text = text.replace(" ", "@#$") - return f"data: {new_text}\n\n" - - -@register_microservice( - name="opea_service@llm_faqgen", - service_type=ServiceType.LLM, - endpoint="/v1/faqgen", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: LLMParamsDoc): - if logflag: - logger.info(input) - access_token = ( - get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None - ) - headers = {} - if access_token: - headers = {"Authorization": f"Bearer {access_token}"} - - model = input.model if input.model else os.getenv("LLM_MODEL_ID") - llm = VLLMOpenAI( - openai_api_key="EMPTY", - openai_api_base=llm_endpoint + "/v1", - model_name=model, - default_headers=headers, - max_tokens=input.max_tokens, - top_p=input.top_p, - streaming=input.stream, - temperature=input.temperature, - ) - - templ = """Create a concise FAQs (frequently asked questions and answers) for following text: - TEXT: {text} - Do not use any prefix or suffix to the FAQ. - """ - PROMPT = PromptTemplate.from_template(templ) - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - texts = text_splitter.split_text(input.query) - - # Create multiple documents - docs = [Document(page_content=t) for t in texts] - - if input.stream: - - async def stream_generator(): - from langserve.serialization import WellKnownLCSerializer - - _serializer = WellKnownLCSerializer() - async for chunk in llm_chain.astream_log(docs): - data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") - if logflag: - logger.info(data) - yield f"data: {data}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = await llm_chain.ainvoke(docs) - response = response["output_text"] - if logflag: - logger.info(response) - return GeneratedDoc(text=response, prompt=input.query) - - -if __name__ == "__main__": - llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080") - # Split text - text_splitter = CharacterTextSplitter() - opea_microservices["opea_service@llm_faqgen"].start() diff --git a/comps/llms/faq-generation/vllm/langchain/requirements-runtime.txt b/comps/llms/faq-generation/vllm/langchain/requirements-runtime.txt deleted file mode 100644 index 225adde271..0000000000 --- a/comps/llms/faq-generation/vllm/langchain/requirements-runtime.txt +++ /dev/null @@ -1 +0,0 @@ -langserve diff --git a/comps/llms/faq-generation/vllm/langchain/requirements.txt b/comps/llms/faq-generation/vllm/langchain/requirements.txt deleted file mode 100644 index 36257d3939..0000000000 --- a/comps/llms/faq-generation/vllm/langchain/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -docarray[full] -fastapi -huggingface_hub -langchain -langchain-huggingface -langchain-openai -langchain_community -langchainhub -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -transformers -uvicorn diff --git a/comps/llms/faq-generation/tgi/langchain/Dockerfile b/comps/llms/src/faq-generation/Dockerfile similarity index 75% rename from comps/llms/faq-generation/tgi/langchain/Dockerfile rename to comps/llms/src/faq-generation/Dockerfile index 357343595e..90439a6542 100644 --- a/comps/llms/faq-generation/tgi/langchain/Dockerfile +++ b/comps/llms/src/faq-generation/Dockerfile @@ -16,10 +16,10 @@ USER user COPY comps /home/user/comps RUN pip install --no-cache-dir --upgrade pip setuptools && \ - pip install --no-cache-dir -r 
/home/user/comps/llms/faq-generation/tgi/langchain/requirements.txt
+    pip install --no-cache-dir -r /home/user/comps/llms/src/faq-generation/requirements.txt
 
 ENV PYTHONPATH=$PYTHONPATH:/home/user
 
-WORKDIR /home/user/comps/llms/faq-generation/tgi/langchain
+WORKDIR /home/user/comps/llms/src/faq-generation
 
 ENTRYPOINT ["bash", "entrypoint.sh"]
diff --git a/comps/llms/src/faq-generation/README.md b/comps/llms/src/faq-generation/README.md
new file mode 100644
index 0000000000..1a57c90d77
--- /dev/null
+++ b/comps/llms/src/faq-generation/README.md
@@ -0,0 +1,110 @@
+# FAQGen LLM Microservice
+
+This microservice interacts with a TGI or vLLM LLM server to generate FAQs (frequently asked questions and answers) from input text. You can use either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm) as the backend service.
+
+## 🚀1. Start Microservice with Docker
+
+### 1.1 Setup Environment Variables
+
+In order to start the FaqGen microservice, you need to set up the following environment variables first.
+
+```bash
+export host_ip=${your_host_ip}
+export LLM_ENDPOINT_PORT=8008
+export FAQ_PORT=9000
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export LLM_MODEL_ID=${your_hf_llm_model}
+export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI" # or "OPEAFAQGen_vLLM" for the vLLM backend
+```
+
+### 1.2 Build Docker Image
+
+Step 1: Prepare the backend LLM Docker image.
+
+If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM Docker image first.
+
+This step is not needed for TGI.
+
+Step 2: Build the FaqGen Docker image.
+
+```bash
+cd ../../../../
+docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
+```
+
+### 1.3 Run Docker
+
+To start a Docker container, you have two options:
+
+- A. Run Docker with CLI
+- B. Run Docker with Docker Compose
+
+You can choose one as needed.
+
+#### 1.3.1 Run Docker with CLI (Option A)
+
+Step 1: Start the backend LLM service.
+Please refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guidelines to start a backend LLM service.
+
+Step 2: Start the FaqGen microservice.
+
+```bash
+docker run -d \
+  --name="llm-faqgen-server" \
+  -p 9000:9000 \
+  --ipc=host \
+  -e http_proxy=$http_proxy \
+  -e https_proxy=$https_proxy \
+  -e LLM_MODEL_ID=$LLM_MODEL_ID \
+  -e LLM_ENDPOINT=$LLM_ENDPOINT \
+  -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
+  -e FAQGen_COMPONENT_NAME=$FAQGen_COMPONENT_NAME \
+  opea/llm-faqgen:latest
+```
+
+#### 1.3.2 Run Docker with Docker Compose (Option B)
+
+```bash
+cd ../../deployment/docker_compose/
+
+# Backend is TGI on Xeon
+docker compose -f faq-generation_tgi.yaml up -d
+
+# Backend is TGI on Gaudi
+# docker compose -f faq-generation_tgi_on_intel_hpu.yaml up -d
+
+# Backend is vLLM on Xeon
+# docker compose -f faq-generation_vllm.yaml up -d
+
+# Backend is vLLM on Gaudi
+# docker compose -f faq-generation_vllm_on_intel_hpu.yaml up -d
+```
+
+## 🚀2. Consume LLM Service
+
+### 2.1 Check Service Status
+
+```bash
+curl http://${host_ip}:${FAQ_PORT}/v1/health_check\
+  -X GET \
+  -H 'Content-Type: application/json'
+```
+
+### 2.2 Consume FAQGen LLM Service
+
+```bash
+# Streaming Response
+# Set stream to True. Default is True.
+curl http://${host_ip}:${FAQ_PORT}/v1/faqgen \ + -X POST \ + -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128}' \ + -H 'Content-Type: application/json' + +# Non-Streaming Response +# Set stream to False. +curl http://${host_ip}:${FAQ_PORT}/v1/faqgen \ + -X POST \ + -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128, "stream":false}' \ + -H 'Content-Type: application/json' +``` diff --git a/comps/llms/faq-generation/vllm/langchain/entrypoint.sh b/comps/llms/src/faq-generation/entrypoint.sh similarity index 81% rename from comps/llms/faq-generation/vllm/langchain/entrypoint.sh rename to comps/llms/src/faq-generation/entrypoint.sh index d60eddd36b..d3ad707a59 100644 --- a/comps/llms/faq-generation/vllm/langchain/entrypoint.sh +++ b/comps/llms/src/faq-generation/entrypoint.sh @@ -5,4 +5,4 @@ pip --no-cache-dir install -r requirements-runtime.txt -python llm.py +python opea_faqgen_microservice.py diff --git a/comps/llms/faq-generation/tgi/langchain/__init__.py b/comps/llms/src/faq-generation/integrations/__init__.py similarity index 100% rename from comps/llms/faq-generation/tgi/langchain/__init__.py rename to comps/llms/src/faq-generation/integrations/__init__.py diff --git a/comps/llms/src/faq-generation/integrations/common.py b/comps/llms/src/faq-generation/integrations/common.py new file mode 100644 index 0000000000..6d756a3ab7 --- /dev/null +++ b/comps/llms/src/faq-generation/integrations/common.py @@ -0,0 +1,110 @@ +# Copyright (C) 2024 Prediction Guard, Inc. +# SPDX-License-Identified: Apache-2.0 + +import os + +import requests +from fastapi.responses import StreamingResponse +from langchain.chains.summarize import load_summarize_chain +from langchain.docstore.document import Document +from langchain.text_splitter import CharacterTextSplitter +from langchain_core.prompts import PromptTemplate + +from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, ServiceType +from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs + +logger = CustomLogger("opea_faqgen") +logflag = os.getenv("LOGFLAG", False) + +templ = """Create a concise FAQs (frequently asked questions and answers) for following text: + TEXT: {text} + Do not use any prefix or suffix to the FAQ. 
+ """ + +# Environment variables +MODEL_NAME = os.getenv("LLM_MODEL_ID") +MODEL_CONFIGS = os.getenv("MODEL_CONFIGS") +TOKEN_URL = os.getenv("TOKEN_URL") +CLIENTID = os.getenv("CLIENTID") +CLIENT_SECRET = os.getenv("CLIENT_SECRET") + +if os.getenv("LLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT") +elif os.getenv("TGI_LLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT") +elif os.getenv("vLLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT") +else: + DEFAULT_ENDPOINT = "http://localhost:8080" + + +def get_llm_endpoint(): + if not MODEL_CONFIGS: + return DEFAULT_ENDPOINT + else: + # Validate and Load the models config if MODEL_CONFIGS is not null + configs_map = {} + try: + configs_map = load_model_configs(MODEL_CONFIGS) + except ConfigError as e: + logger.error(f"Failed to load model configurations: {e}") + raise ConfigError(f"Failed to load model configurations: {e}") + try: + return configs_map.get(MODEL_NAME).get("endpoint") + except ConfigError as e: + logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}") + raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs") + + +class OPEAFAQGen(OpeaComponent): + """A specialized OPEA FAQGen component derived from OpeaComponent. + + Attributes: + client (TGI/vLLM): An instance of the TGI/vLLM client for text generation. + """ + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.LLM.name.lower(), description, config) + self.access_token = ( + get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None + ) + self.text_splitter = CharacterTextSplitter() + self.llm_endpoint = get_llm_endpoint() + health_status = self.check_health() + if not health_status: + logger.error("OPEAFAQGen health check failed.") + + async def generate(self, input: LLMParamsDoc, client): + """Invokes the TGI/vLLM LLM service to generate FAQ output for the provided input. + + Args: + input (LLMParamsDoc): The input text(s). + client: TGI/vLLM based client + """ + PROMPT = PromptTemplate.from_template(templ) + llm_chain = load_summarize_chain(llm=client, prompt=PROMPT) + texts = self.text_splitter.split_text(input.query) + + # Create multiple documents + docs = [Document(page_content=t) for t in texts] + + if input.stream: + + async def stream_generator(): + from langserve.serialization import WellKnownLCSerializer + + _serializer = WellKnownLCSerializer() + async for chunk in llm_chain.astream_log(docs): + data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") + if logflag: + logger.info(data) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + response = await llm_chain.ainvoke(docs) + response = response["output_text"] + if logflag: + logger.info(response) + return GeneratedDoc(text=response, prompt=input.query) diff --git a/comps/llms/src/faq-generation/integrations/tgi.py b/comps/llms/src/faq-generation/integrations/tgi.py new file mode 100644 index 0000000000..41fa7b58b5 --- /dev/null +++ b/comps/llms/src/faq-generation/integrations/tgi.py @@ -0,0 +1,73 @@ +# Copyright (C) 2024 Prediction Guard, Inc. 
+# SPDX-License-Identified: Apache-2.0 + +import os + +import requests +from langchain_community.llms import HuggingFaceEndpoint + +from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +from .common import * + +logger = CustomLogger("opea_faqgen_tgi") +logflag = os.getenv("LOGFLAG", False) + + +@OpeaComponentRegistry.register("OPEAFAQGen_TGI") +class OPEAFAQGen_TGI(OPEAFAQGen): + """A specialized OPEA FAQGen TGI component derived from OPEAFAQGen for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. + + Attributes: + client (TGI): An instance of the TGI client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the TGI LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + # response = requests.get(f"{self.llm_endpoint}/health") + + # Will remove after TGI gaudi fix health bug + url = f"{self.llm_endpoint}/generate" + data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}} + headers = {"Content-Type": "application/json"} + response = requests.post(url=url, json=data, headers=headers) + + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: LLMParamsDoc): + """Invokes the TGI LLM service to generate FAQ output for the provided input. + + Args: + input (LLMParamsDoc): The input text(s). + """ + server_kwargs = {} + if self.access_token: + server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"} + + self.client = HuggingFaceEndpoint( + endpoint_url=self.llm_endpoint, + max_new_tokens=input.max_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.stream, + server_kwargs=server_kwargs, + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/faq-generation/integrations/vllm.py b/comps/llms/src/faq-generation/integrations/vllm.py new file mode 100644 index 0000000000..6e8b696ea6 --- /dev/null +++ b/comps/llms/src/faq-generation/integrations/vllm.py @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Prediction Guard, Inc. +# SPDX-License-Identified: Apache-2.0 + +import os + +import requests +from langchain_community.llms import VLLMOpenAI + +from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +from .common import * + +logger = CustomLogger("opea_faqgen_vllm") +logflag = os.getenv("LOGFLAG", False) + + +@OpeaComponentRegistry.register("OPEAFAQGen_vLLM") +class OPEAFAQGen_vLLM(OPEAFAQGen): + """A specialized OPEA FAQGen vLLM component derived from OPEAFAQGen for interacting with vLLM services based on Lanchain VLLMOpenAI API. + + Attributes: + client (vLLM): An instance of the vLLM client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the vLLM LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. 
+ """ + + try: + response = requests.get(f"{self.llm_endpoint}/health") + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: LLMParamsDoc): + """Invokes the vLLM LLM service to generate FAQ output for the provided input. + + Args: + input (LLMParamsDoc): The input text(s). + """ + headers = {} + if self.access_token: + headers = {"Authorization": f"Bearer {self.access_token}"} + + self.client = VLLMOpenAI( + openai_api_key="EMPTY", + openai_api_base=self.llm_endpoint + "/v1", + model_name=MODEL_NAME, + default_headers=headers, + max_tokens=input.max_tokens, + top_p=input.top_p, + streaming=input.stream, + temperature=input.temperature, + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/faq-generation/opea_faqgen_microservice.py b/comps/llms/src/faq-generation/opea_faqgen_microservice.py new file mode 100644 index 0000000000..e98ca7eb61 --- /dev/null +++ b/comps/llms/src/faq-generation/opea_faqgen_microservice.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time + +from integrations.tgi import OPEAFAQGen_TGI +from integrations.vllm import OPEAFAQGen_vLLM + +from comps import ( + CustomLogger, + LLMParamsDoc, + OpeaComponentLoader, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("llm_faqgen") +logflag = os.getenv("LOGFLAG", False) + +llm_component_name = os.getenv("FAQGen_COMPONENT_NAME", "OPEAFAQGen_TGI") +# Initialize OpeaComponentLoader +loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM FAQGen Component: {llm_component_name}") + + +@register_microservice( + name="opea_service@llm_faqgen", + service_type=ServiceType.LLM, + endpoint="/v1/faqgen", + host="0.0.0.0", + port=9000, +) +@register_statistics(names=["opea_service@llm_faqgen"]) +async def llm_generate(input: LLMParamsDoc): + start = time.time() + + # Log the input if logging is enabled + if logflag: + logger.info(input) + + try: + # Use the controller to invoke the active component + response = await loader.invoke(input) + # Record statistics + statistics_dict["opea_service@llm_faqgen"].append_latency(time.time() - start, None) + return response + + except Exception as e: + logger.error(f"Error during FaqGen invocation: {e}") + raise + + +if __name__ == "__main__": + logger.info("OPEA FAQGen Microservice is starting...") + opea_microservices["opea_service@llm_faqgen"].start() diff --git a/comps/llms/faq-generation/tgi/langchain/requirements-runtime.txt b/comps/llms/src/faq-generation/requirements-runtime.txt similarity index 100% rename from comps/llms/faq-generation/tgi/langchain/requirements-runtime.txt rename to comps/llms/src/faq-generation/requirements-runtime.txt diff --git a/comps/llms/faq-generation/tgi/langchain/requirements.txt b/comps/llms/src/faq-generation/requirements.txt similarity index 100% rename from comps/llms/faq-generation/tgi/langchain/requirements.txt rename to comps/llms/src/faq-generation/requirements.txt diff --git a/comps/third_parties/vllm/deployment/docker_compose/build_docker_vllm.sh b/comps/third_parties/vllm/src/build_docker_vllm.sh similarity index 100% rename from comps/third_parties/vllm/deployment/docker_compose/build_docker_vllm.sh rename to comps/third_parties/vllm/src/build_docker_vllm.sh diff --git 
a/comps/third_parties/vllm/deployment/docker_compose/build_docker_vllm_openvino.sh b/comps/third_parties/vllm/src/build_docker_vllm_openvino.sh similarity index 100% rename from comps/third_parties/vllm/deployment/docker_compose/build_docker_vllm_openvino.sh rename to comps/third_parties/vllm/src/build_docker_vllm_openvino.sh diff --git a/tests/llms/test_llms_faq-generation_langchain_tgi.sh b/tests/llms/test_llms_faq-generation_langchain_tgi.sh new file mode 100644 index 0000000000..6273c8b2a6 --- /dev/null +++ b/tests/llms/test_llms_faq-generation_langchain_tgi.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-faqgen built fail" + exit 1 + else + echo "opea/llm-faqgen built successful" + fi +} + +function start_service() { + export LLM_ENDPOINT_PORT=5060 + export FAQ_PORT=5061 + export host_ip=${host_ip} + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} # Remember to set HF_TOKEN before invoking this test! + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI" + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f faq-generation_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_backend_microservices() { + # tgi + validate_services \ + "${host_ip}:${LLM_ENDPOINT_PORT}/generate" \ + "generated_text" \ + "tgi" \ + "tgi-server" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + + # faq + validate_services \ + "${host_ip}:${FAQ_PORT}/v1/faqgen" \ + "text" \ + "llm - faqgen" \ + "llm-faqgen-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 32}' + + # faq, non-stream + validate_services \ + "${host_ip}:${FAQ_PORT}/v1/faqgen" \ + "text" \ + "FAQGen" \ + "llm-faqgen-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 32, "stream":false}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f faq-generation_tgi.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_backend_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh new file mode 100644 index 0000000000..7a1a4fc698 --- /dev/null +++ b/tests/llms/test_llms_faq-generation_langchain_tgi_on_intel_hpu.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-faqgen built fail" + exit 1 + else + echo "opea/llm-faqgen built successful" + fi +} + +function start_service() { + export LLM_ENDPOINT_PORT=5062 + export FAQ_PORT=5063 + export host_ip=${host_ip} + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} # Remember to set HF_TOKEN before invoking this test! + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI" + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f faq-generation_tgi_on_intel_hpu.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_backend_microservices() { + # tgi + validate_services \ + "${host_ip}:${LLM_ENDPOINT_PORT}/generate" \ + "generated_text" \ + "tgi" \ + "tgi-gaudi-server" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + + # faq + validate_services \ + "${host_ip}:${FAQ_PORT}/v1/faqgen" \ + "text" \ + "llm - faqgen" \ + "llm-faqgen-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 32}' + + # faq, non-stream + validate_services \ + "${host_ip}:${FAQ_PORT}/v1/faqgen" \ + "text" \ + "FAQGen" \ + "llm-faqgen-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 32, "stream":false}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f faq-generation_tgi_on_intel_hpu.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_backend_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh new file mode 100644 index 0000000000..37d3be22dc --- /dev/null +++ b/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + git clone https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork/ + git checkout 3c39626 + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . + if [ $? -ne 0 ]; then + echo "opea/vllm-gaudi built fail" + exit 1 + else + echo "opea/vllm-gaudi built successful" + fi + + cd $WORKPATH + docker build --no-cache -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-faqgen built fail" + exit 1 + else + echo "opea/llm-faqgen built successful" + fi + +} + +function start_service() { + export LLM_ENDPOINT_PORT=5066 + export FAQ_PORT=5067 + export host_ip=${host_ip} + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} # Remember to set HF_TOKEN before invoking this test! 
+ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export FAQGen_COMPONENT_NAME="OPEAFAQGen_vLLM" + export VLLM_SKIP_WARMUP=true + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f faq-generation_vllm_on_intel_hpu.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_backend_microservices() { + # vllm + validate_services \ + "${host_ip}:${LLM_ENDPOINT_PORT}/v1/completions" \ + "text" \ + "vllm" \ + "vllm-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' + + # faq + validate_services \ + "${host_ip}:${FAQ_PORT}/v1/faqgen" \ + "text" \ + "llm - faqgen" \ + "llm-faqgen-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 32}' + + # faq, non-stream + validate_services \ + "${host_ip}:${FAQ_PORT}/v1/faqgen" \ + "text" \ + "FAQGen" \ + "llm-faqgen-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 32, "stream":false}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f faq-generation_vllm_on_intel_hpu.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_backend_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_faq-generation_tgi_langchain.sh b/tests/llms/test_llms_faq-generation_tgi_langchain.sh deleted file mode 100644 index aea0631eeb..0000000000 --- a/tests/llms/test_llms_faq-generation_tgi_langchain.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -x - -WORKPATH=$(dirname "$PWD") -ip_address=$(hostname -I | awk '{print $1}') -LOG_PATH="$WORKPATH/tests" - -function build_docker_images() { - cd $WORKPATH - docker build --no-cache -t opea/llm-faqgen-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/langchain/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/llm-faqgen-tgi built fail" - exit 1 - else - echo "opea/llm-faqgen-tgi built successful" - fi -} - -function start_service() { - tgi_endpoint_port=5073 - export your_hf_llm_model="Intel/neural-chat-7b-v3-3" - # Remember to set HF_TOKEN before invoking this test! - export HF_TOKEN=${HF_TOKEN} - docker run -d --name="test-comps-llm-faq-tgi-endpoint" -p $tgi_endpoint_port:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${your_hf_llm_model} - export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint_port}" - - faq_port=5074 - docker run -d --name="test-comps-llm-faq-tgi-server" -p ${faq_port}:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:comps - - # check whether tgi is fully ready - n=0 - until [[ "$n" -ge 200 ]] || [[ $ready == true ]]; do - docker logs test-comps-llm-faq-tgi-endpoint > ${LOG_PATH}/test-comps-llm-faq-tgi-endpoint.log - n=$((n+1)) - if grep -q Connected ${LOG_PATH}/test-comps-llm-faq-tgi-endpoint.log; then - break - fi - sleep 5s - done - sleep 5s -} - -function validate_microservice() { - faq_port=5074 - http_proxy="" curl http://${ip_address}:${faq_port}/v1/faqgen \ - -X POST \ - -d '{"query":"Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data."}' \ - -H 'Content-Type: application/json' - docker logs test-comps-llm-faq-tgi-endpoint - - URL="http://${ip_address}:$faq_port/v1/faqgen" - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d '{"query": "What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ llm - faqgen ] HTTP status is 200. Checking content..." - local CONTENT=$(curl -s -X POST -d '{"query": "What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/llm_faqgen.log) - - if echo 'text: ' | grep -q "$EXPECTED_RESULT"; then - echo "[ llm - faqgen ] Content is as expected." 
- docker logs test-comps-llm-faq-tgi-server >> ${LOG_PATH}/llm_faqgen.log - else - echo "[ llm - faqgen ] Content does not match the expected result: $CONTENT" - docker logs test-comps-llm-faq-tgi-server >> ${LOG_PATH}/llm_faqgen.log - exit 1 - fi - else - echo "[ llm - faqgen ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-llm-faq-tgi-server >> ${LOG_PATH}/llm_faqgen.log - exit 1 - fi -} - -function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-llm-faq*") - if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi -} - -function main() { - - stop_docker - - build_docker_images - start_service - - validate_microservice - - stop_docker - echo y | docker system prune - -} - -main
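For reference, below is a minimal Python client sketch equivalent to the curl examples in the new `comps/llms/src/faq-generation/README.md`. It is only an illustration, not part of the patch: it assumes the refactored `llm-faqgen` container from the compose files above is reachable at `http://localhost:9000` (the default `FAQ_PORT`); adjust the host and port for your deployment.

```python
# Hypothetical client sketch for the refactored FaqGen microservice (/v1/faqgen).
# Assumes the llm-faqgen container is listening on localhost:9000 (default FAQ_PORT).
import requests

FAQGEN_URL = "http://localhost:9000/v1/faqgen"

payload = {
    "query": (
        "Text Embeddings Inference (TEI) is a toolkit for deploying and serving "
        "open source text embeddings and sequence classification models."
    ),
    "max_tokens": 128,
    "stream": False,  # set to True (the default) for a server-sent-events stream
}

response = requests.post(FAQGEN_URL, json=payload, timeout=120)
response.raise_for_status()
# Non-streaming requests return a GeneratedDoc with "text" and "prompt" fields.
print(response.json()["text"])
```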