From 98c62a016d2ef0b9c325813438d8511a1e7e8243 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 11 Sep 2024 14:51:38 +0800 Subject: [PATCH] Move neuralspeed embedding rerank and vllm-xft to catalog (#647) * Move neuralspeed embedding rerank and vllm-xft to catalog Signed-off-by: lvliang-intel * delete xft code Signed-off-by: lvliang-intel * update d yaml Signed-off-by: lvliang-intel --------- Signed-off-by: lvliang-intel Co-authored-by: chen, suyue --- .../docker/compose/reranks-compose-cd.yaml | 8 -- comps/embeddings/neural-speed/Dockerfile | 30 ------ comps/embeddings/neural-speed/README.md | 37 ------- comps/embeddings/neural-speed/__init__.py | 2 - .../neural-speed/dependency/Dockerfile | 26 ----- .../neural-speed/dependency/client.py | 31 ------ .../dependency/client_multibatch.py | 40 -------- .../neural-speed/dependency/requirements.txt | 16 --- .../neural-speed/dependency/server.py | 81 ---------------- .../docker_compose_embedding.yaml | 21 ---- .../neural-speed/embedding_neuralspeed_svc.py | 83 ---------------- .../embeddings/neural-speed/requirements.txt | 11 --- .../llms/text-generation/vllm/xft/Dockerfile | 97 ------------------- comps/llms/text-generation/vllm/xft/README.md | 49 ---------- comps/llms/text-generation/vllm/xft/llm.py | 60 ------------ .../text-generation/vllm/xft/requirements.txt | 10 -- comps/llms/text-generation/vllm/xft/run.sh | 28 ------ comps/reranks/neural-speed/README.md | 32 ------ comps/reranks/neural-speed/__init__.py | 2 - comps/reranks/neural-speed/docker/Dockerfile | 31 ------ .../docker/docker_compose_embedding.yaml | 22 ----- .../neuralspeed-docker/Dockerfile | 27 ------ .../neural-speed/neuralspeed-docker/client.py | 35 ------- .../neuralspeed-docker/client_multibatch.py | 45 --------- .../neuralspeed-docker/requirements.txt | 16 --- .../neural-speed/neuralspeed-docker/server.py | 91 ----------------- comps/reranks/neural-speed/requirements.txt | 11 --- .../neural-speed/reranking_neuralspeed_svc.py | 93 ------------------ tests/test_reranks_mosec-neuralspeed.sh | 84 ---------------- 29 files changed, 1119 deletions(-) delete mode 100644 comps/embeddings/neural-speed/Dockerfile delete mode 100644 comps/embeddings/neural-speed/README.md delete mode 100644 comps/embeddings/neural-speed/__init__.py delete mode 100644 comps/embeddings/neural-speed/dependency/Dockerfile delete mode 100644 comps/embeddings/neural-speed/dependency/client.py delete mode 100644 comps/embeddings/neural-speed/dependency/client_multibatch.py delete mode 100644 comps/embeddings/neural-speed/dependency/requirements.txt delete mode 100644 comps/embeddings/neural-speed/dependency/server.py delete mode 100644 comps/embeddings/neural-speed/docker_compose_embedding.yaml delete mode 100644 comps/embeddings/neural-speed/embedding_neuralspeed_svc.py delete mode 100644 comps/embeddings/neural-speed/requirements.txt delete mode 100644 comps/llms/text-generation/vllm/xft/Dockerfile delete mode 100644 comps/llms/text-generation/vllm/xft/README.md delete mode 100644 comps/llms/text-generation/vllm/xft/llm.py delete mode 100644 comps/llms/text-generation/vllm/xft/requirements.txt delete mode 100644 comps/llms/text-generation/vllm/xft/run.sh delete mode 100644 comps/reranks/neural-speed/README.md delete mode 100644 comps/reranks/neural-speed/__init__.py delete mode 100644 comps/reranks/neural-speed/docker/Dockerfile delete mode 100644 comps/reranks/neural-speed/docker/docker_compose_embedding.yaml delete mode 100644 comps/reranks/neural-speed/neuralspeed-docker/Dockerfile delete mode 
100644 comps/reranks/neural-speed/neuralspeed-docker/client.py delete mode 100644 comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py delete mode 100644 comps/reranks/neural-speed/neuralspeed-docker/requirements.txt delete mode 100644 comps/reranks/neural-speed/neuralspeed-docker/server.py delete mode 100644 comps/reranks/neural-speed/requirements.txt delete mode 100644 comps/reranks/neural-speed/reranking_neuralspeed_svc.py delete mode 100644 tests/test_reranks_mosec-neuralspeed.sh diff --git a/.github/workflows/docker/compose/reranks-compose-cd.yaml b/.github/workflows/docker/compose/reranks-compose-cd.yaml index 1f468f8e5..3e5e7caab 100644 --- a/.github/workflows/docker/compose/reranks-compose-cd.yaml +++ b/.github/workflows/docker/compose/reranks-compose-cd.yaml @@ -14,11 +14,3 @@ services: build: dockerfile: comps/reranks/mosec/langchain/Dockerfile image: ${REGISTRY:-opea}/reranking-langchain-mosec:${TAG:-latest} - reranking-mosec-neural-speed: - build: - dockerfile: comps/reranks/neural-speed/docker/Dockerfile - image: ${REGISTRY:-opea}/reranking-mosec-neural-speed:${TAG:-latest} - reranking-mosec-neural-speed-endpoint: - build: - dockerfile: comps/reranks/neural-speed/neuralspeed-docker/Dockerfile - image: ${REGISTRY:-opea}/reranking-mosec-neural-speed-endpoint:${TAG:-latest} diff --git a/comps/embeddings/neural-speed/Dockerfile b/comps/embeddings/neural-speed/Dockerfile deleted file mode 100644 index 3b495ad54..000000000 --- a/comps/embeddings/neural-speed/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM langchain/langchain:latest - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev \ - vim - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/comps/embeddings/neural-speed/requirements.txt - -RUN pip3 install llmspec mosec msgspec httpx requests - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/embeddings/neural-speed - -ENTRYPOINT ["python", "embedding_neuralspeed_svc.py"] - diff --git a/comps/embeddings/neural-speed/README.md b/comps/embeddings/neural-speed/README.md deleted file mode 100644 index 17450da28..000000000 --- a/comps/embeddings/neural-speed/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Embedding Neural Speed - -## build Mosec endpoint docker image - -``` -docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t langchain-mosec:neuralspeed -f comps/embeddings/neural-speed/dependency/Dockerfile . -``` - -## build embedding microservice docker image - -``` -docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec:neuralspeed -f comps/embeddings/neural-speed/Dockerfile . -``` - -Note: Please contact us to request model files before building images.
- -## launch Mosec endpoint docker container - -``` -docker run -d --name="embedding-langchain-mosec-endpoint" -p 6001:8000 langchain-mosec:neuralspeed -``` - -## launch embedding microservice docker container - -``` -export MOSEC_EMBEDDING_ENDPOINT=http://{mosec_embedding_host_ip}:6001 -docker run -d --name="embedding-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6000:6000 --ipc=host -e MOSEC_EMBEDDING_ENDPOINT=$MOSEC_EMBEDDING_ENDPOINT opea/embedding-langchain-mosec:neuralspeed -``` - -## run client test - -``` -curl localhost:6000/v1/embeddings \ - -X POST \ - -d '{"text":"Hello, world!"}' \ - -H 'Content-Type: application/json' -``` diff --git a/comps/embeddings/neural-speed/__init__.py b/comps/embeddings/neural-speed/__init__.py deleted file mode 100644 index 916f3a44b..000000000 --- a/comps/embeddings/neural-speed/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/embeddings/neural-speed/dependency/Dockerfile b/comps/embeddings/neural-speed/dependency/Dockerfile deleted file mode 100644 index d10383d2a..000000000 --- a/comps/embeddings/neural-speed/dependency/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM ubuntu:22.04 -ARG DEBIAN_FRONTEND=noninteractive - -ENV GLIBC_TUNABLES glibc.cpu.x86_shstk=permissive - -COPY comps /root/comps -COPY neural_speed-0.1.dev117+gafc0030.d20240815-cp310-cp310-linux_x86_64.whl /root/ -COPY bge-base-q8.bin /root/ - -RUN apt update && apt install -y python3 python3-pip -RUN pip3 install -r /root/comps/embeddings/neural-speed/dependency/requirements.txt -RUN pip3 install llmspec mosec msgspec httpx requests -RUN pip3 install /root/neural_speed-0.1.dev117+gafc0030.d20240815-cp310-cp310-linux_x86_64.whl - -RUN cd /root/ && export HF_ENDPOINT=https://hf-mirror.com && huggingface-cli download --resume-download BAAI/bge-base-en-v1.5 --local-dir /root/bge-base-en-v1.5 - - -ENV LD_PRELOAD=/root/libstdc++.so.6 - - -WORKDIR /root/comps/embeddings/neural-speed/dependency - -CMD ["python3", "server.py"] diff --git a/comps/embeddings/neural-speed/dependency/client.py b/comps/embeddings/neural-speed/dependency/client.py deleted file mode 100644 index cd718ca5e..000000000 --- a/comps/embeddings/neural-speed/dependency/client.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -from http import HTTPStatus - -import httpx -import msgspec -import requests - -input_text = "what a nice day" -req = { - "query": input_text, -} - -httpx_response = httpx.post("http://127.0.0.1:6001/inference", content=msgspec.msgpack.encode(req)) - -requests_response = requests.post("http://127.0.0.1:6001/inference", data=msgspec.msgpack.encode(req)) - -MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "http://127.0.0.1:6001") - -request_url = MOSEC_EMBEDDING_ENDPOINT + "/inference" -print(f"request_url = {request_url}") -resp_3 = requests.post(request_url, data=msgspec.msgpack.encode(req)) - -if httpx_response.status_code == HTTPStatus.OK and requests_response.status_code == HTTPStatus.OK: - print(f"OK: \n {msgspec.msgpack.decode(httpx_response.content)}") - print(f"OK: \n {msgspec.msgpack.decode(requests_response.content)}") - print(f"OK: \n {msgspec.msgpack.decode(resp_3.content)}") -else: - print(f"err[{httpx_response.status_code}] {httpx_response.text}") diff --git
a/comps/embeddings/neural-speed/dependency/client_multibatch.py b/comps/embeddings/neural-speed/dependency/client_multibatch.py deleted file mode 100644 index ed49b6322..000000000 --- a/comps/embeddings/neural-speed/dependency/client_multibatch.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from http import HTTPStatus -from threading import Thread - -import httpx -import msgspec - -req = { - "query": "Return the ‘thread identifier’ of the current thread. This is a nonzero integer. Its value has no direct meaning; it is intended as a magic cookie to be used e.g. to index a dictionary of thread-specific data. Thread identifiers may be recycled when a thread exits and another thread is created.", -} -reqs = [] -BATCH = 32 -for i in range(BATCH): - reqs.append(msgspec.msgpack.encode(req)) - - -def post_func(threadIdx): - resp = httpx.post("http://127.0.0.1:6001/inference", content=reqs[threadIdx]) - ret = f"thread {threadIdx} \n" - if resp.status_code == HTTPStatus.OK: - ret += f"OK: {msgspec.msgpack.decode(resp.content)['embeddings'][:16]}" - else: - ret += f"err[{resp.status_code}] {resp.text}" - print(ret) - - -threads = [] -for i in range(BATCH): - t = Thread( - target=post_func, - args=[ - i, - ], - ) - threads.append(t) - -for i in range(BATCH): - threads[i].start() diff --git a/comps/embeddings/neural-speed/dependency/requirements.txt b/comps/embeddings/neural-speed/dependency/requirements.txt deleted file mode 100644 index 50dc540fc..000000000 --- a/comps/embeddings/neural-speed/dependency/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -accelerate -cmake -datasets -huggingface_hub -matplotlib -numpy -peft -protobuf<3.20 -py-cpuinfo -sentencepiece -tiktoken -torch -transformers -transformers_stream_generator -zipfile38 diff --git a/comps/embeddings/neural-speed/dependency/server.py b/comps/embeddings/neural-speed/dependency/server.py deleted file mode 100644 index b47259968..000000000 --- a/comps/embeddings/neural-speed/dependency/server.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import time -from typing import Any, List - -import numpy -from mosec import Server, Worker, get_logger -from mosec.mixin import TypedMsgPackMixin -from msgspec import Struct -from neural_speed import Model -from transformers import AutoTokenizer - -logger = get_logger() - -INFERENCE_BATCH_SIZE = 32 -INFERENCE_MAX_WAIT_TIME = 30 -INFERENCE_WORKER_NUM = 1 -INFERENCE_CONTEXT = 512 - -TorchModel = "/root/bge-base-en-v1.5" -NS_Bin = "/root/bge-base-q8.bin" - -NS_Model = "bert" - - -class Request(Struct, kw_only=True): - query: str - - -class Response(Struct, kw_only=True): - embeddings: List[float] - - -class Inference(TypedMsgPackMixin, Worker): - - def __init__(self): - super().__init__() - self.tokenizer = AutoTokenizer.from_pretrained(TorchModel) - self.model = Model() - self.model.init_from_bin( - NS_Model, - NS_Bin, - batch_size=INFERENCE_BATCH_SIZE, - n_ctx=INFERENCE_CONTEXT + 2, - ) - - def forward(self, data: List[Request]) -> List[Response]: - batch = len(data) - sequences = [d.query for d in data] - inputs = self.tokenizer( - sequences, - padding=True, - truncation=True, - max_length=INFERENCE_CONTEXT, - return_tensors="pt", - ) - st = time.time() - ns_outputs = self.model( - **inputs, - reinit=True, - logits_all=True, - continuous_batching=False, - ignore_padding=True, - ) - logger.info(f"batch {batch} input 
shape {inputs.input_ids.shape} time {time.time()-st}") - ns_outputs = ns_outputs[:, 0] - ns_outputs = ns_outputs / numpy.linalg.norm(ns_outputs, axis=1, keepdims=True) - resps = [] - for i in range(batch): - resp = Response(embeddings=ns_outputs[i].tolist()) - resps.append(resp) - return resps - - -if __name__ == "__main__": - server = Server() - server.append_worker( - Inference, max_batch_size=INFERENCE_BATCH_SIZE, max_wait_time=INFERENCE_MAX_WAIT_TIME, num=INFERENCE_WORKER_NUM - ) - server.run() diff --git a/comps/embeddings/neural-speed/docker_compose_embedding.yaml b/comps/embeddings/neural-speed/docker_compose_embedding.yaml deleted file mode 100644 index 2e00eeca2..000000000 --- a/comps/embeddings/neural-speed/docker_compose_embedding.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -version: "3.8" - -services: - embedding: - image: opea/embedding-langchain-mosec:neuralspeed - container_name: embedding-langchain-mosec-server - ports: - - "6000:6000" - ipc: host - environment: - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - MOSEC_EMBEDDING_ENDPOINT: ${MOSEC_EMBEDDING_ENDPOINT} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/embeddings/neural-speed/embedding_neuralspeed_svc.py b/comps/embeddings/neural-speed/embedding_neuralspeed_svc.py deleted file mode 100644 index ca2d27d5f..000000000 --- a/comps/embeddings/neural-speed/embedding_neuralspeed_svc.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -import time -from typing import List, Optional - -import httpx -import msgspec -import requests -from langchain_community.embeddings import OpenAIEmbeddings -from langsmith import traceable - -from comps import ( - EmbedDoc, - ServiceType, - TextDoc, - opea_microservices, - register_microservice, - register_statistics, - statistics_dict, -) - - -class MosecEmbeddings(OpenAIEmbeddings): - - def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None - ) -> List[List[float]]: - _chunk_size = chunk_size or self.chunk_size - batched_embeddings: List[List[float]] = [] - response = self.client.create(input=texts, **self._invocation_params) - if not isinstance(response, dict): - response = response.model_dump() - batched_embeddings.extend(r["embedding"] for r in response["data"]) - - _cached_empty_embedding: Optional[List[float]] = None - - def empty_embedding() -> List[float]: - nonlocal _cached_empty_embedding - if _cached_empty_embedding is None: - average_embedded = self.client.create(input="", **self._invocation_params) - if not isinstance(average_embedded, dict): - average_embedded = average_embedded.model_dump() - _cached_empty_embedding = average_embedded["data"][0]["embedding"] - return _cached_empty_embedding - - return [e if e is not None else empty_embedding() for e in batched_embeddings] - - -@register_microservice( - name="opea_service@embedding_mosec", - service_type=ServiceType.EMBEDDING, - endpoint="/v1/embeddings", - host="0.0.0.0", - port=6000, - input_datatype=TextDoc, - output_datatype=EmbedDoc, -) -@traceable(run_type="embedding") -@register_statistics(names=["opea_service@embedding_mosec"]) -def embedding(input: TextDoc) -> EmbedDoc: - start = time.time() - req = { - "query": input.text, - } - request_url = MOSEC_EMBEDDING_ENDPOINT + "/inference" - resp = requests.post(request_url, data=msgspec.msgpack.encode(req)) - - embed_vector = 
msgspec.msgpack.decode(resp.content)["embeddings"] - res = EmbedDoc(text=req["query"][0], embedding=embed_vector) - statistics_dict["opea_service@embedding_mosec"].append_latency(time.time() - start, None) - return res - - -if __name__ == "__main__": - MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "http://127.0.0.1:6001") - os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT - os.environ["OPENAI_API_KEY"] = "Dummy key" - MODEL_ID = os.environ.get("MODEL_ID", "BAAI/bge-base-en-v1.5") - embeddings = MosecEmbeddings(model=MODEL_ID) - print("NeuralSpeed Embedding Microservice Initialized.") - opea_microservices["opea_service@embedding_mosec"].start() diff --git a/comps/embeddings/neural-speed/requirements.txt b/comps/embeddings/neural-speed/requirements.txt deleted file mode 100644 index 9fa1a059c..000000000 --- a/comps/embeddings/neural-speed/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -docarray[full] -fastapi -langchain -langchain_community -openai -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -uvicorn diff --git a/comps/llms/text-generation/vllm/xft/Dockerfile b/comps/llms/text-generation/vllm/xft/Dockerfile deleted file mode 100644 index 4b7d7d342..000000000 --- a/comps/llms/text-generation/vllm/xft/Dockerfile +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM ubuntu:22.04 - -ARG TAG=main - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y --no-install-recommends \ - gcc-12 \ - g++-12 \ - make \ - wget \ - libnuma-dev \ - numactl \ - git \ - pkg-config \ - software-properties-common \ - zlib1g-dev \ - libssl-dev \ - libffi-dev \ - libbz2-dev \ - libsqlite3-dev \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 60 \ - && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 60 \ - && apt-get autoremove -y \ - && rm -rf /var/lib/apt/lists/* - -# Install python -WORKDIR /tmp -RUN wget -q https://www.python.org/ftp/python/3.8.10/Python-3.8.10.tgz \ - && tar -xzvf Python-3.8.10.tgz -WORKDIR /tmp/Python-3.8.10 -RUN ./configure --prefix=/usr/bin/python3.8 --enable-optimizations \ - && make -j \ - && make install \ - && update-alternatives --install /usr/bin/python python /usr/bin/python3.8/bin/python3.8 60 \ - && update-alternatives --install /usr/bin/pip pip /usr/bin/python3.8/bin/pip3 60 \ - && python -m pip install --no-cache-dir --upgrade pip setuptools \ - && pip install --no-cache-dir wheel \ - && rm -rf /tmp/* \ - && echo "export PATH=/usr/bin/python3.8:\$PATH" >> ~/.bashrc - -RUN pip install --no-cache-dir torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu -RUN pip install --no-cache-dir cmake==3.26.1 transformers==4.41.2 sentencepiece==0.1.99 accelerate==0.23.0 protobuf tiktoken transformers-stream-generator einops \ - && ln -s /usr/bin/python3.8/lib/python3.8/site-packages/cmake/data/bin/cmake /usr/bin/cmake - -# Install oneCCL -RUN git clone https://github.com/oneapi-src/oneCCL.git /tmp/oneCCL -WORKDIR /tmp/oneCCL -RUN git checkout 2021.10 \ - && sed -i 's/cpu_gpu_dpcpp/./g' cmake/templates/oneCCLConfig.cmake.in \ - && mkdir build -WORKDIR /tmp/oneCCL/build -RUN cmake .. 
-DCMAKE_INSTALL_PREFIX=/usr/local/oneCCL \ - && make -j install - -RUN echo "source /usr/local/oneCCL/env/setvars.sh" >> ~/.bashrc - -WORKDIR /home/user/ -RUN rm -rf /tmp/oneCCL - -RUN git clone https://github.com/intel/xFasterTransformer.git - -SHELL ["/bin/bash", "-c"] -WORKDIR /home/user/xFasterTransformer -RUN git checkout ${TAG} \ - && export "LD_LIBRARY_PATH=/usr/local/mklml_lnx_2019.0.5.20190502/lib:$LD_LIBRARY_PATH" \ - && export "PATH=/usr/bin/python3.8:$PATH" \ - && echo "source /usr/local/oneCCL/env/setvars.sh" >> ~/.bash_profile \ - && source ~/.bash_profile \ - && python setup.py build \ - && python setup.py egg_info bdist_wheel --verbose \ - && pip install --no-cache-dir dist/* - -RUN mkdir -p /usr/local/xft/lib \ - && cp /home/user/xFasterTransformer/build/libxfastertransformer.so /usr/local/xft/lib \ - && cp /home/user/xFasterTransformer/build/libxft_comm_helper.so /usr/local/xft/lib \ - && cp -r /home/user/xFasterTransformer/include /usr/local/xft/ \ - && mkdir -p /usr/local/include/xft/ \ - && ln -s /usr/local/xft/include /usr/local/include/xft/include - -RUN echo "export \$(python -c 'import xfastertransformer as xft; print(xft.get_env())')" >> ~/.bashrc - -COPY comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/comps/llms/text-generation/vllm/xft/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/root - -RUN chmod +x /home/user/comps/llms/text-generation/vllm/xft/run.sh - -WORKDIR /home/user/comps/llms/text-generation/vllm/xft/ - -ENTRYPOINT ["/home/user/comps/llms/text-generation/vllm/xft/run.sh"] diff --git a/comps/llms/text-generation/vllm/xft/README.md b/comps/llms/text-generation/vllm/xft/README.md deleted file mode 100644 index 4a2bdf68e..000000000 --- a/comps/llms/text-generation/vllm/xft/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# vLLM-xFT - -vLLM-xFT is a fork of vLLM to integrate the xfastertransformer backend, maintaining compatibility with most of the official vLLM's features. -For usage of vllm-xFT, please refer to [xFasterTransformer/vllm-xft](https://github.com/intel/xFasterTransformer/blob/main/serving/vllm-xft.md) - -## 🚀 Start Microservice with Docker - -### 1 Build Docker Image - -```bash -cd ../../../ -docker build -t opea/llm-vllm-xft:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/xft/Dockerfile . -``` - -### 2 Run Docker with CLI - -```bash -docker run -it -p 9000:9000 -v /home/sdp/Qwen2-7B-Instruct/:/Qwen2-7B-Instruct/ -e vLLM_LLM_ENDPOINT="http://localhost:18688" -e HF_DATASET_DIR="/Qwen2-7B-Instruct/" -e OUTPUT_DIR="./output" -e TOKEN_PATH="/Qwen2-7B-Instruct/" -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy --ipc=host opea/llm-vllm-xft:latest -``` - -## 🚀3. Consume LLM Service - -### 3.1 Check Service Status - -```bash -curl http://${your_ip}:9000/v1/health_check\ - -X GET \ - -H 'Content-Type: application/json' -``` - -### 3.2 Consume LLM Service - -You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`. - -The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`. 
- -```bash -# non-streaming mode -curl http://${your_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ - -H 'Content-Type: application/json' - -# streaming mode -curl http://${your_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ - -H 'Content-Type: application/json' -``` diff --git a/comps/llms/text-generation/vllm/xft/llm.py b/comps/llms/text-generation/vllm/xft/llm.py deleted file mode 100644 index 07d892bde..000000000 --- a/comps/llms/text-generation/vllm/xft/llm.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from fastapi.responses import StreamingResponse -from langchain_community.llms import VLLMOpenAI - -from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice - -logger = CustomLogger("llm_vllm_xft") -logflag = os.getenv("LOGFLAG", False) - - -@register_microservice( - name="opea_service@llm_vllm_xft", - service_type=ServiceType.LLM, - endpoint="/v1/chat/completions", - host="0.0.0.0", - port=9000, -) -def llm_generate(input: LLMParamsDoc): - if logflag: - logger.info(input) - llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:18688") - llm = VLLMOpenAI( - openai_api_key="EMPTY", - openai_api_base=llm_endpoint + "/v1", - max_tokens=input.max_new_tokens, - model_name="xft", - top_p=input.top_p, - temperature=input.temperature, - presence_penalty=input.repetition_penalty, - streaming=input.streaming, - ) - - if input.streaming: - - def stream_generator(): - chat_response = "" - for text in llm.stream(input.query): - chat_response += text - chunk_repr = repr(text.encode("utf-8")) - if logflag: - logger.info(f"[llm - chat_stream] chunk:{chunk_repr}") - yield f"data: {chunk_repr}\n\n" - if logflag: - logger.info(f"[llm - chat_stream] stream response: {chat_response}") - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = llm.invoke(input.query) - if logflag: - logger.info(response) - return GeneratedDoc(text=response, prompt=input.query) - - -if __name__ == "__main__": - opea_microservices["opea_service@llm_vllm_xft"].start() diff --git a/comps/llms/text-generation/vllm/xft/requirements.txt b/comps/llms/text-generation/vllm/xft/requirements.txt deleted file mode 100644 index a4accaed2..000000000 --- a/comps/llms/text-generation/vllm/xft/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -docarray[full] -fastapi -langchain==0.1.16 -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -uvicorn -vllm-xft diff --git a/comps/llms/text-generation/vllm/xft/run.sh b/comps/llms/text-generation/vllm/xft/run.sh deleted file mode 100644 index b729760ca..000000000 --- a/comps/llms/text-generation/vllm/xft/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Preloading libiomp5.so is essential for optimal performance. -# libiomp5.so is the Intel OpenMP runtime library, providing parallel computation support, -# thread management, task scheduling, and performance optimization on Intel X86 platforms. 
-export $(python -c 'import xfastertransformer as xft; print(xft.get_env())') - -# convert the model to fastertransformer format -python -c 'import os; import xfastertransformer as xft; xft.Qwen2Convert().convert(os.environ["HF_DATASET_DIR"], os.environ["OUTPUT_DIR"])' - -unset http_proxy - -# serving with vllm -python -m vllm.entrypoints.openai.api_server \ - --model ${OUTPUT_DIR} \ - --tokenizer ${TOKEN_PATH} \ - --dtype bf16 \ - --kv-cache-dtype fp16 \ - --served-model-name xft \ - --host 0.0.0.0 \ - --port 18688 \ - --trust-remote-code & - -# run llm microservice wrapper -python llm.py diff --git a/comps/reranks/neural-speed/README.md b/comps/reranks/neural-speed/README.md deleted file mode 100644 index c1841e16a..000000000 --- a/comps/reranks/neural-speed/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# build Mosec endpoint docker image - -``` -docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t langchain-mosec:neuralspeed-reranks -f comps/reranks/neural-speed/neuralspeed-docker/Dockerfile . -``` - -# build Reranking microservice docker image - -``` -docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/reranking-langchain-mosec:neuralspeed -f comps/reranks/neural-speed/docker/Dockerfile . -``` - -Note: Please contact us to request model files before building images. - -# launch Mosec endpoint docker container - -``` -docker run -d --name="reranking-langchain-mosec-endpoint" -p 6001:8000 langchain-mosec:neuralspeed-reranks -``` - -# launch Reranking microservice docker container - -``` -export MOSEC_RERANKING_ENDPOINT=http://127.0.0.1:6001 -docker run -d --name="reranking-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6000:8000 --ipc=host -e MOSEC_RERANKING_ENDPOINT=$MOSEC_RERANKING_ENDPOINT opea/reranking-langchain-mosec:neuralspeed -``` - -# run client test - -``` -curl http://localhost:6000/v1/reranking -X POST -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' -H 'Content-Type: application/json' -``` diff --git a/comps/reranks/neural-speed/__init__.py b/comps/reranks/neural-speed/__init__.py deleted file mode 100644 index 916f3a44b..000000000 --- a/comps/reranks/neural-speed/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/reranks/neural-speed/docker/Dockerfile b/comps/reranks/neural-speed/docker/Dockerfile deleted file mode 100644 index 8ffed65ec..000000000 --- a/comps/reranks/neural-speed/docker/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM langchain/langchain:latest - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev \ - vim - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/comps/reranks/neural-speed/requirements.txt - -RUN pip3 install llmspec mosec msgspec httpx requests -RUN pip3 install torch==2.2.2 --trusted-host download.pytorch.org --index-url https://download.pytorch.org/whl/cpu - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/reranks/neural-speed - -ENTRYPOINT ["python", "reranking_neuralspeed_svc.py"] - diff --git 
a/comps/reranks/neural-speed/docker/docker_compose_embedding.yaml b/comps/reranks/neural-speed/docker/docker_compose_embedding.yaml deleted file mode 100644 index d5f59b4a0..000000000 --- a/comps/reranks/neural-speed/docker/docker_compose_embedding.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -version: "3.8" - -services: - reranking: - image: opea/reranking-langchain-mosec:neuralspeed - container_name: reranking-langchain-mosec-server - ports: - - "6000:8000" - ipc: host - environment: - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - MOSEC_RERANKING_ENDPOINT: ${MOSEC_RERANKING_ENDPOINT} - LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/reranks/neural-speed/neuralspeed-docker/Dockerfile b/comps/reranks/neural-speed/neuralspeed-docker/Dockerfile deleted file mode 100644 index 42dcbad8c..000000000 --- a/comps/reranks/neural-speed/neuralspeed-docker/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -From ubuntu:22.04 -ARG DEBIAN_FRONTEND=noninteractive - -ENV GLIBC_TUNABLES glibc.cpu.x86_shstk=permissive - -COPY comps /root/comps -COPY neural_speed-0.1.dev45+g41ea0aa-cp310-cp310-linux_x86_64.whl /root/ -COPY bge-large-r-q8.bin /root/ -COPY libstdc++.so.6 /root/ - -RUN apt update && apt install -y python3 python3-pip -RUN pip3 install -r /root/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt -RUN pip3 install llmspec mosec msgspec httpx requests -RUN pip3 install /root/neural_speed-0.1.dev45+g41ea0aa-cp310-cp310-linux_x86_64.whl - -RUN cd /root/ && export HF_ENDPOINT=https://hf-mirror.com && huggingface-cli download --resume-download BAAI/bge-reranker-large --local-dir /root/bge-reranker-large - - -ENV LD_PRELOAD=/root/libstdc++.so.6 - - -WORKDIR /root/comps/reranks/neural-speed/neuralspeed-docker - -CMD ["python3", "server.py"] diff --git a/comps/reranks/neural-speed/neuralspeed-docker/client.py b/comps/reranks/neural-speed/neuralspeed-docker/client.py deleted file mode 100644 index 02017faaf..000000000 --- a/comps/reranks/neural-speed/neuralspeed-docker/client.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -from http import HTTPStatus - -import httpx -import msgspec -import requests - -req = { - "query": "talk is cheap, show me the code", - "docs": [ - "what a nice day", - "life is short, use python", - "early bird catches the worm", - ], -} - -httpx_response = httpx.post("http://127.0.0.1:8080/inference", content=msgspec.msgpack.encode(req)) - -requests_response = requests.post("http://127.0.0.1:8080/inference", data=msgspec.msgpack.encode(req)) - -MOSEC_RERANKING_ENDPOINT = os.environ.get("MOSEC_RERANKING_ENDPOINT", "http://127.0.0.1:8080") - -request_url = MOSEC_RERANKING_ENDPOINT + "/inference" -print(f"request_url = {request_url}") -resp_3 = requests.post(request_url, data=msgspec.msgpack.encode(req)) - -if httpx_response.status_code == HTTPStatus.OK and requests_response.status_code == HTTPStatus.OK: - print(f"OK: \n {msgspec.msgpack.decode(httpx_response.content)}") - print(f"OK: \n {msgspec.msgpack.decode(requests_response.content)}") - print(f"OK: \n {msgspec.msgpack.decode(resp_3.content)}") -else: - print(f"err[{httpx_response.status_code}] {httpx_response.text}") diff --git a/comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py 
b/comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py deleted file mode 100644 index 09eee1dfb..000000000 --- a/comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from http import HTTPStatus -from threading import Thread - -import httpx -import msgspec - -req = { - "query": "talk is cheap, show me the code", - "docs": [ - "what a nice day", - "life is short, use python", - "early bird catches the worm", - ], -} -reqs = [] -BATCH = 32 -for i in range(BATCH): - reqs.append(msgspec.msgpack.encode(req)) - - -def post_func(threadIdx): - resp = httpx.post("http://127.0.0.1:8080/inference", content=reqs[threadIdx]) - ret = f"thread {threadIdx} \n" - if resp.status_code == HTTPStatus.OK: - ret += f"OK: {msgspec.msgpack.decode(resp.content)['scores']}" - else: - ret += f"err[{resp.status_code}] {resp.text}" - print(ret) - - -threads = [] -for i in range(BATCH): - t = Thread( - target=post_func, - args=[ - i, - ], - ) - threads.append(t) - -for i in range(BATCH): - threads[i].start() diff --git a/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt b/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt deleted file mode 100644 index 50dc540fc..000000000 --- a/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -accelerate -cmake -datasets -huggingface_hub -matplotlib -numpy -peft -protobuf<3.20 -py-cpuinfo -sentencepiece -tiktoken -torch -transformers -transformers_stream_generator -zipfile38 diff --git a/comps/reranks/neural-speed/neuralspeed-docker/server.py b/comps/reranks/neural-speed/neuralspeed-docker/server.py deleted file mode 100644 index 0176abcfb..000000000 --- a/comps/reranks/neural-speed/neuralspeed-docker/server.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -import time -from typing import Any, List - -import numpy -from mosec import Server, Worker, get_logger -from mosec.mixin import TypedMsgPackMixin -from msgspec import Struct -from neural_speed import Model -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -logger = get_logger() - -INFERENCE_BATCH_SIZE = 128 -INFERENCE_MAX_WAIT_TIME = 10 -INFERENCE_WORKER_NUM = 1 -INFERENCE_CONTEXT = 512 - -TorchModel = "/root/bge-reranker-large" -NS_Bin = "/root/bge-large-r-q8.bin" - -NS_Model = "bert" - - -class Request(Struct, kw_only=True): - query: str - docs: List[str] - - -class Response(Struct, kw_only=True): - scores: List[float] - - -class Inference(TypedMsgPackMixin, Worker): - - def __init__(self): - super().__init__() - self.tokenizer = AutoTokenizer.from_pretrained(TorchModel) - self.model = Model() - self.model.init_from_bin( - NS_Model, - NS_Bin, - batch_size=INFERENCE_BATCH_SIZE, - n_ctx=INFERENCE_CONTEXT + 2, - ) - - def forward(self, data: List[Request]) -> List[Response]: - batch = len(data) - ndoc = [] - inps = [] - for data in data: - inp = [[data.query, doc] for doc in data.docs] - inps.extend(inp) - ndoc.append(len(data.docs)) - outs = [] - for i in range(0, len(inps), INFERENCE_BATCH_SIZE): - inp_bs = inps[i : i + INFERENCE_BATCH_SIZE] - inputs = self.tokenizer( - inp_bs, padding=True, truncation=True, max_length=INFERENCE_CONTEXT, return_tensors="pt" - ) - st = time.time() - output = self.model( - **inputs, - reinit=True, - logits_all=True, - 
continuous_batching=False, - ignore_padding=True, - ) - logger.info(f"Total batch {batch} input shape {inputs.input_ids.shape} time {time.time()-st}") - outs.append(output) - ns_outputs = numpy.concatenate(outs, axis=0) - resps = [] - pos = 0 - for i in range(batch): - resp = Response(scores=ns_outputs[pos : pos + ndoc[i]].tolist()) - pos += ndoc[i] - resps.append(resp) - return resps - - -if __name__ == "__main__": - INFERENCE_BATCH_SIZE = int(os.environ.get("MAX_BATCH_SIZE", 128)) - INFERENCE_MAX_WAIT_TIME = int(os.environ.get("MAX_WAIT_TIME", 1)) - server = Server() - server.append_worker( - Inference, max_batch_size=INFERENCE_BATCH_SIZE, max_wait_time=INFERENCE_MAX_WAIT_TIME, num=INFERENCE_WORKER_NUM - ) - server.run() diff --git a/comps/reranks/neural-speed/requirements.txt b/comps/reranks/neural-speed/requirements.txt deleted file mode 100644 index 9fa1a059c..000000000 --- a/comps/reranks/neural-speed/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -docarray[full] -fastapi -langchain -langchain_community -openai -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -uvicorn diff --git a/comps/reranks/neural-speed/reranking_neuralspeed_svc.py b/comps/reranks/neural-speed/reranking_neuralspeed_svc.py deleted file mode 100644 index 098378a52..000000000 --- a/comps/reranks/neural-speed/reranking_neuralspeed_svc.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import heapq -import json -import os -import re -import time -from typing import List, Optional, Union - -import httpx -import msgspec -import requests -import torch -from langchain_core.prompts import ChatPromptTemplate -from langsmith import traceable - -from comps import ( - CustomLogger, - LLMParamsDoc, - SearchedDoc, - ServiceType, - opea_microservices, - register_microservice, - register_statistics, - statistics_dict, -) -from comps.cores.proto.api_protocol import ( - ChatCompletionRequest, - RerankingRequest, - RerankingResponse, - RerankingResponseData, -) - - -@register_microservice( - name="opea_service@reranking_mosec", - service_type=ServiceType.RERANK, - endpoint="/v1/reranking", - host="0.0.0.0", - port=8000, - input_datatype=SearchedDoc, - output_datatype=LLMParamsDoc, -) -@traceable(run_type="reranking") -@register_statistics(names=["opea_service@reranking_mosec"]) -def reranking( - input: Union[SearchedDoc, RerankingRequest, ChatCompletionRequest] -) -> Union[LLMParamsDoc, RerankingResponse, ChatCompletionRequest]: - start = time.time() - reranking_results = [] - if input.retrieved_docs: - docs = [doc.text for doc in input.retrieved_docs] - url = mosec_reranking_endpoint + "/inference" - if isinstance(input, SearchedDoc): - query = input.initial_query - else: - # for RerankingRequest, ChatCompletionRequest - query = input.input - data = {"query": query, "docs": docs} - resp = requests.post(url, data=msgspec.msgpack.encode(data)) - response_list = msgspec.msgpack.decode(resp.content)["scores"] - response = torch.nn.functional.sigmoid(torch.tensor(response_list)) - length = len(response) - resp_list = response.tolist() - sorted_score = heapq.nlargest(length, resp_list) - sorted_score_index = list(map(resp_list.index, sorted_score)) - - for i in range(input.top_n): - reranking_results.append( - {"text": input.retrieved_docs[sorted_score_index[i]].text, "score": sorted_score[i]} - ) - - statistics_dict["opea_service@reranking_mosec"].append_latency(time.time() - start, None) - if isinstance(input,
SearchedDoc): - return LLMParamsDoc(query=input.initial_query, documents=[doc["text"] for doc in reranking_results]) - else: - reranking_docs = [] - for doc in reranking_results: - reranking_docs.append(RerankingResponseData(text=doc["text"], score=doc["score"])) - if isinstance(input, RerankingRequest): - return RerankingResponse(reranked_docs=reranking_docs) - - if isinstance(input, ChatCompletionRequest): - input.reranked_docs = reranking_docs - input.documents = [doc["text"] for doc in reranking_results] - return input - - -if __name__ == "__main__": - mosec_reranking_endpoint = os.getenv("MOSEC_RERANKING_ENDPOINT", "http://localhost:8080") - print("NeuralSpeed Reranking Microservice Initialized.") - opea_microservices["opea_service@reranking_mosec"].start() diff --git a/tests/test_reranks_mosec-neuralspeed.sh b/tests/test_reranks_mosec-neuralspeed.sh deleted file mode 100644 index 4512dc794..000000000 --- a/tests/test_reranks_mosec-neuralspeed.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -x - -WORKPATH=$(dirname "$PWD") -ip_address=$(hostname -I | awk '{print $1}') - -function build_mosec_docker_images() { - cd $WORKPATH - echo $(pwd) - cp /data2/nswhl/* ./ - docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t langchain-mosec:neuralspeed-reranks -f comps/reranks/neural-speed/neuralspeed-docker/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/reranking-langchain-mosec-endpoint built fail" - exit 1 - else - echo "opea/reranking-langchain-mosec-endpoint built successful" - fi -} - -function build_docker_images() { - cd $WORKPATH - echo $(pwd) - docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/reranking-langchain-mosec:neuralspeed -f comps/reranks/neural-speed/docker/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/reranking-langchain-mosec built fail" - exit 1 - else - echo "opea/reranking-langchain-mosec built successful" - fi -} - -function start_service() { - mosec_endpoint=5006 - model="BAAI/bge-reranker-large" - unset http_proxy - docker run -d --name="test-comps-reranking-langchain-mosec-endpoint" -p $mosec_endpoint:8000 langchain-mosec:neuralspeed-reranks - export MOSEC_RERANKING_ENDPOINT="http://${ip_address}:${mosec_endpoint}" - mosec_service_port=5007 - docker run -d --name="test-comps-reranking-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p ${mosec_service_port}:8000 --ipc=host -e MOSEC_RERANKING_ENDPOINT=$MOSEC_RERANKING_ENDPOINT opea/reranking-langchain-mosec:neuralspeed - sleep 3m -} - -function validate_microservice() { - mosec_service_port=5007 - result=$(http_proxy="" curl http://${ip_address}:${mosec_service_port}/v1/reranking\ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json') - if [[ $result == *"Deep"* ]]; then - echo "Result correct." - else - echo "Result wrong. Received was $result" - docker logs test-comps-reranking-langchain-mosec-endpoint - docker logs test-comps-reranking-langchain-mosec-server - exit 1 - fi -} - -function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-reranking-langchain-mosec-*") - if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi -} - -function main() { - - stop_docker - - build_mosec_docker_images - - build_docker_images - - start_service - - validate_microservice - - stop_docker - echo y | docker system prune - -} - -main