Add support for Audio and Video summarization to Docsum (#865)
* v2a services

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* add a2t - llm

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update whisper serve

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* add data service

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* gateway

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* clean gateway & orchestrator

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* adding functional tests

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* updates read me file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* name changes

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update readme file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update readme file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update readme file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update readme file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update readme file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update max token option

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update the test files

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* readme updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* readme updates

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* clean code

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update dataprep-compose-cd.yaml file

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* merge and sync

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* merge and sync gateway

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* adding the copyright header

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update the end of file char

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update gateway

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update gateway-docsum

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix test files

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* fix test files

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* merge & sync

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* Update dataprep-compose.yaml

* update test

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

* update dataprep-compose

Signed-off-by: Mustafa <mustafa.cetin@intel.com>

---------

Signed-off-by: Mustafa <mustafa.cetin@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Abolfazl Shahbazi <12436063+ashahba@users.noreply.github.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
4 people authored Nov 13, 2024
1 parent 3b106c8 commit baafa40
Showing 23 changed files with 1,350 additions and 35 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/docker/compose/dataprep-compose.yaml
Original file line number Diff line number Diff line change
@@ -51,3 +51,15 @@ services:
build:
dockerfile: comps/dataprep/neo4j/llama_index/Dockerfile
image: ${REGISTRY:-opea}/dataprep-neo4j-llamaindex:${TAG:-latest}
dataprep-multimedia2text:
build:
dockerfile: comps/dataprep/multimedia2text/Dockerfile
image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
dataprep-video2audio:
build:
dockerfile: comps/dataprep/multimedia2text/video2audio/Dockerfile
image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
dataprep-audio2text:
build:
dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile
image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
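The three new image entries above all follow compose's `${VAR:-default}` substitution pattern, so `REGISTRY` and `TAG` fall back to `opea` and `latest` when unset. A rough Python sketch of how one of those image references resolves (illustrative only; compose performs this substitution itself):

```python
import os

def resolve_image(name, registry_var="REGISTRY", tag_var="TAG"):
    # Mimic docker compose's ${VAR:-default} fallback for image references.
    registry = os.environ.get(registry_var) or "opea"
    tag = os.environ.get(tag_var) or "latest"
    return f"{registry}/{name}:{tag}"

print(resolve_image("dataprep-audio2text"))
```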
2 changes: 2 additions & 0 deletions comps/__init__.py
@@ -36,6 +36,8 @@
ScoreDoc,
PIIRequestDoc,
PIIResponseDoc,
Audio2text,
DocSumDoc,
)

# Constants
18 changes: 14 additions & 4 deletions comps/asr/whisper/dependency/whisper_model.py
@@ -14,7 +14,7 @@
class WhisperModel:
"""Convert audio to text."""

def __init__(self, model_name_or_path="openai/whisper-small", language="english", device="cpu", hpu_max_len=8192):
def __init__(
self,
model_name_or_path="openai/whisper-small",
language="english",
device="cpu",
hpu_max_len=8192,
return_timestamps=False,
):
if device == "hpu":
# Explicitly link HPU with Torch
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
@@ -31,6 +38,7 @@ def __init__(self, model_name_or_path="openai/whisper-small", language="english"

self.language = language
self.hpu_max_len = hpu_max_len
self.return_timestamps = return_timestamps

if device == "hpu":
self._warmup_whisper_hpu_graph("https://github.com/Spycsh/assets/raw/main/ljspeech_60s_audio.wav")
@@ -104,7 +112,7 @@ def _warmup_whisper_hpu_graph(self, url):
)
),
language=self.language,
return_timestamps=True,
return_timestamps=self.return_timestamps,
)

def audio2text(self, audio_path):
@@ -167,7 +175,7 @@ def audio2text(self, audio_path):
)
),
language=self.language,
return_timestamps=True,
return_timestamps=self.return_timestamps,
)
# pylint: disable=E1101
result = self.processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0]
@@ -180,7 +188,9 @@


if __name__ == "__main__":
asr = WhisperModel(model_name_or_path="openai/whisper-small", language="english", device="cpu")
asr = WhisperModel(
model_name_or_path="openai/whisper-small", language="english", device="cpu", return_timestamps=True
)

# Test multilanguage asr
asr.language = "chinese"
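The change above replaces the hard-coded `return_timestamps=True` at both `generate` call sites with a flag stored on the instance at construction time. A toy sketch of that pattern in isolation (`FakeWhisper` is hypothetical, not the real transformers API):

```python
class FakeWhisper:
    """Toy stand-in showing a constructor flag replacing a hard-coded argument."""

    def __init__(self, return_timestamps=False):
        self.return_timestamps = return_timestamps

    def generate(self, audio_path):
        # Every call site now consults the instance flag instead of a literal.
        result = {"text": f"transcript of {audio_path}"}
        if self.return_timestamps:
            result["timestamps"] = [(0.0, 1.0)]
        return result
```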
9 changes: 8 additions & 1 deletion comps/asr/whisper/dependency/whisper_server.py
@@ -39,6 +39,7 @@ async def audio_to_text(request: Request):

audio = AudioSegment.from_file(file_name)
audio = audio.set_frame_rate(16000)

audio.export(f"{file_name}", format="wav")
try:
asr_result = asr.audio2text(file_name)
@@ -57,8 +58,14 @@
parser.add_argument("--model_name_or_path", type=str, default="openai/whisper-small")
parser.add_argument("--language", type=str, default="english")
parser.add_argument("--device", type=str, default="cpu")
parser.add_argument("--return_timestamps", type=str, default=True)

args = parser.parse_args()
asr = WhisperModel(model_name_or_path=args.model_name_or_path, language=args.language, device=args.device)
asr = WhisperModel(
model_name_or_path=args.model_name_or_path,
language=args.language,
device=args.device,
return_timestamps=args.return_timestamps,
)

uvicorn.run(app, host=args.host, port=args.port)
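Note that the new argument is declared as `parser.add_argument("--return_timestamps", type=str, default=True)`, so any value passed on the command line arrives as a string, and `--return_timestamps false` yields the truthy string `"false"`. A common fix for this argparse pitfall is a string-to-bool converter (a sketch of the idiom, not what this commit does):

```python
import argparse

def str2bool(value):
    # Map common textual spellings to a real bool; argparse's type=bool
    # would treat any non-empty string, including "false", as True.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--return_timestamps", type=str2bool, default=False)
```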
40 changes: 13 additions & 27 deletions comps/cores/mega/gateway.py
@@ -17,10 +17,11 @@
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatMessage,
DocSumChatCompletionRequest,
EmbeddingRequest,
UsageInfo,
)
from ..proto.docarray import LLMParams, LLMParamsDoc, RerankedDoc, RerankerParms, RetrieverParms, TextDoc
from ..proto.docarray import DocSumDoc, LLMParams, LLMParamsDoc, RerankedDoc, RerankerParms, RetrieverParms, TextDoc
from .constants import MegaServiceEndpoint, ServiceRoleType, ServiceType
from .micro_service import MicroService

@@ -409,34 +410,20 @@ async def handle_request(self, request: Request):
class DocSumGateway(Gateway):
def __init__(self, megaservice, host="0.0.0.0", port=8888):
super().__init__(
megaservice, host, port, str(MegaServiceEndpoint.DOC_SUMMARY), ChatCompletionRequest, ChatCompletionResponse
megaservice,
host,
port,
str(MegaServiceEndpoint.DOC_SUMMARY),
input_datatype=DocSumChatCompletionRequest,
output_datatype=ChatCompletionResponse,
)

async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
data = await request.form()
async def handle_request(self, request: Request):
data = await request.json()
stream_opt = data.get("stream", True)
chat_request = ChatCompletionRequest.parse_obj(data)
file_summaries = []
if files:
for file in files:
file_path = f"/tmp/{file.filename}"

import aiofiles

async with aiofiles.open(file_path, "wb") as f:
await f.write(await file.read())
docs = read_text_from_file(file, file_path)
os.remove(file_path)
if isinstance(docs, list):
file_summaries.extend(docs)
else:
file_summaries.append(docs)

if file_summaries:
prompt = self._handle_message(chat_request.messages) + "\n".join(file_summaries)
else:
prompt = self._handle_message(chat_request.messages)
chat_request = ChatCompletionRequest.model_validate(data)

prompt = self._handle_message(chat_request.messages)
parameters = LLMParams(
max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
top_k=chat_request.top_k if chat_request.top_k else 10,
@@ -446,10 +433,9 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
streaming=stream_opt,
language=chat_request.language if chat_request.language else "auto",
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"query": prompt}, llm_parameters=parameters
initial_inputs={data["type"]: prompt}, llm_parameters=parameters
)
for node, response in result_dict.items():
# Here it suppose the last microservice in the megaservice is LLM.
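With this rewrite the gateway no longer accepts multipart file uploads: it reads a JSON body and schedules the megaservice with the prompt keyed by the request's `type` field (`"text"`, `"audio"`, or `"video"`, matching `DocSumDoc`). A minimal sketch of that routing step, with a hypothetical payload:

```python
def build_initial_inputs(data: dict, prompt: str) -> dict:
    # The gateway keys the initial input by the request's "type" field,
    # so downstream nodes can tell which modality they received.
    return {data["type"]: prompt}

# Hypothetical request body for the rewritten /v1/docsum endpoint.
payload = {"type": "text", "messages": "Summarize this report ...", "stream": False}
print(build_initial_inputs(payload, payload["messages"]))
```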
8 changes: 8 additions & 0 deletions comps/cores/proto/api_protocol.py
@@ -269,6 +269,14 @@ class ChatCompletionRequest(BaseModel):
request_type: Literal["chat"] = "chat"


class DocSumChatCompletionRequest(BaseModel):
llm_params: Optional[ChatCompletionRequest] = None
text: Optional[str] = None
audio: Optional[str] = None
video: Optional[str] = None
type: Optional[str] = None


class AudioChatCompletionRequest(BaseModel):
audio: str
messages: Optional[
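`DocSumChatCompletionRequest` makes every media field optional so one schema covers text, audio, and video requests, with `type` naming which field is populated. A plain-dataclass approximation of its shape (the real class is a pydantic `BaseModel` and also nests `llm_params`):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class DocSumRequestSketch:
    # Mirrors the optional media fields of DocSumChatCompletionRequest.
    text: Optional[str] = None
    audio: Optional[str] = None
    video: Optional[str] = None
    type: Optional[str] = None

req = DocSumRequestSketch(audio="<base64-encoded wav>", type="audio")
```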
10 changes: 10 additions & 0 deletions comps/cores/proto/docarray.py
@@ -20,6 +20,10 @@ class TextDoc(BaseDoc, TopologyInfo):
text: str = None


class Audio2text(BaseDoc, TopologyInfo):
query: str = None


class FactualityDoc(BaseDoc):
reference: str
text: str
@@ -74,6 +78,12 @@ class Base64ByteStrDoc(BaseDoc):
byte_str: str


class DocSumDoc(BaseDoc):
text: Optional[str] = None
audio: Optional[str] = None
video: Optional[str] = None


class DocPath(BaseDoc):
path: str
chunk_size: int = 1500
30 changes: 30 additions & 0 deletions comps/dataprep/multimedia2text/Dockerfile
@@ -0,0 +1,30 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Use the official Python 3.11 slim image as the base image
FROM python:3.11-slim

# Set environment variables
ENV LANG=C.UTF-8

# Install necessary packages and clean up to reduce image size
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
libgl1-mesa-glx \
libjemalloc-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Create a directory for the user and set it as the working directory
WORKDIR /home/user

# Copy the application code and requirements file to the container
COPY comps /home/user/comps
COPY requirements.txt /home/user/requirements.txt
COPY ./comps/dataprep/multimedia2text/multimedia2text.py /home/user/multimedia2text.py

# Install Python dependencies
RUN python -m pip install --no-cache-dir -r requirements.txt

# Define the entry point for the container
ENTRYPOINT ["python", "multimedia2text.py"]
