diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index 2e5d6f9cd..7908e8c26 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -51,3 +51,15 @@ services: build: dockerfile: comps/dataprep/neo4j/llama_index/Dockerfile image: ${REGISTRY:-opea}/dataprep-neo4j-llamaindex:${TAG:-latest} + dataprep-multimedia2text: + build: + dockerfile: comps/dataprep/multimedia2text/Dockerfile + image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest} + dataprep-video2audio: + build: + dockerfile: comps/dataprep/multimedia2text/video2audio/Dockerfile + image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest} + dataprep-audio2text: + build: + dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile + image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest} diff --git a/comps/__init__.py b/comps/__init__.py index 153acad49..ee7caaf63 100644 --- a/comps/__init__.py +++ b/comps/__init__.py @@ -36,6 +36,8 @@ ScoreDoc, PIIRequestDoc, PIIResponseDoc, + Audio2text, + DocSumDoc, ) # Constants diff --git a/comps/asr/whisper/dependency/whisper_model.py b/comps/asr/whisper/dependency/whisper_model.py index cc16f1637..94f1c7ce5 100644 --- a/comps/asr/whisper/dependency/whisper_model.py +++ b/comps/asr/whisper/dependency/whisper_model.py @@ -14,7 +14,14 @@ class WhisperModel: """Convert audio to text.""" - def __init__(self, model_name_or_path="openai/whisper-small", language="english", device="cpu", hpu_max_len=8192): + def __init__( + self, + model_name_or_path="openai/whisper-small", + language="english", + device="cpu", + hpu_max_len=8192, + return_timestamps=False, + ): if device == "hpu": # Explicitly link HPU with Torch from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi @@ -31,6 +38,7 @@ def __init__(self, model_name_or_path="openai/whisper-small", language="english" self.language = 
language self.hpu_max_len = hpu_max_len + self.return_timestamps = return_timestamps if device == "hpu": self._warmup_whisper_hpu_graph("https://github.com/Spycsh/assets/raw/main/ljspeech_60s_audio.wav") @@ -104,7 +112,7 @@ def _warmup_whisper_hpu_graph(self, url): ) ), language=self.language, - return_timestamps=True, + return_timestamps=self.return_timestamps, ) def audio2text(self, audio_path): @@ -167,7 +175,7 @@ def audio2text(self, audio_path): ) ), language=self.language, - return_timestamps=True, + return_timestamps=self.return_timestamps, ) # pylint: disable=E1101 result = self.processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0] @@ -180,7 +188,9 @@ def audio2text(self, audio_path): if __name__ == "__main__": - asr = WhisperModel(model_name_or_path="openai/whisper-small", language="english", device="cpu") + asr = WhisperModel( + model_name_or_path="openai/whisper-small", language="english", device="cpu", return_timestamps=True + ) # Test multilanguage asr asr.language = "chinese" diff --git a/comps/asr/whisper/dependency/whisper_server.py b/comps/asr/whisper/dependency/whisper_server.py index 1a5c760d2..481bf0da0 100644 --- a/comps/asr/whisper/dependency/whisper_server.py +++ b/comps/asr/whisper/dependency/whisper_server.py @@ -39,6 +39,7 @@ async def audio_to_text(request: Request): audio = AudioSegment.from_file(file_name) audio = audio.set_frame_rate(16000) + audio.export(f"{file_name}", format="wav") try: asr_result = asr.audio2text(file_name) @@ -57,8 +58,14 @@ async def audio_to_text(request: Request): parser.add_argument("--model_name_or_path", type=str, default="openai/whisper-small") parser.add_argument("--language", type=str, default="english") parser.add_argument("--device", type=str, default="cpu") + parser.add_argument("--return_timestamps", type=str, default=True) args = parser.parse_args() - asr = WhisperModel(model_name_or_path=args.model_name_or_path, language=args.language, device=args.device) + asr 
= WhisperModel( + model_name_or_path=args.model_name_or_path, + language=args.language, + device=args.device, + return_timestamps=args.return_timestamps, + ) uvicorn.run(app, host=args.host, port=args.port) diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py index 7d075eae0..4d44f66f7 100644 --- a/comps/cores/mega/gateway.py +++ b/comps/cores/mega/gateway.py @@ -17,10 +17,11 @@ ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, + DocSumChatCompletionRequest, EmbeddingRequest, UsageInfo, ) -from ..proto.docarray import LLMParams, LLMParamsDoc, RerankedDoc, RerankerParms, RetrieverParms, TextDoc +from ..proto.docarray import DocSumDoc, LLMParams, LLMParamsDoc, RerankedDoc, RerankerParms, RetrieverParms, TextDoc from .constants import MegaServiceEndpoint, ServiceRoleType, ServiceType from .micro_service import MicroService @@ -409,34 +410,20 @@ async def handle_request(self, request: Request): class DocSumGateway(Gateway): def __init__(self, megaservice, host="0.0.0.0", port=8888): super().__init__( - megaservice, host, port, str(MegaServiceEndpoint.DOC_SUMMARY), ChatCompletionRequest, ChatCompletionResponse + megaservice, + host, + port, + str(MegaServiceEndpoint.DOC_SUMMARY), + input_datatype=DocSumChatCompletionRequest, + output_datatype=ChatCompletionResponse, ) - async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)): - data = await request.form() + async def handle_request(self, request: Request): + data = await request.json() stream_opt = data.get("stream", True) - chat_request = ChatCompletionRequest.parse_obj(data) - file_summaries = [] - if files: - for file in files: - file_path = f"/tmp/{file.filename}" - - import aiofiles - - async with aiofiles.open(file_path, "wb") as f: - await f.write(await file.read()) - docs = read_text_from_file(file, file_path) - os.remove(file_path) - if isinstance(docs, list): - file_summaries.extend(docs) - else: - file_summaries.append(docs) - - if 
file_summaries: - prompt = self._handle_message(chat_request.messages) + "\n".join(file_summaries) - else: - prompt = self._handle_message(chat_request.messages) + chat_request = ChatCompletionRequest.model_validate(data) + prompt = self._handle_message(chat_request.messages) parameters = LLMParams( max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, top_k=chat_request.top_k if chat_request.top_k else 10, @@ -446,10 +433,9 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File( presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, - language=chat_request.language if chat_request.language else "auto", ) result_dict, runtime_graph = await self.megaservice.schedule( - initial_inputs={"query": prompt}, llm_parameters=parameters + initial_inputs={data["type"]: prompt}, llm_parameters=parameters ) for node, response in result_dict.items(): # Here it suppose the last microservice in the megaservice is LLM. 
diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index cf8b2ca1d..d8d469ffb 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -269,6 +269,14 @@ class ChatCompletionRequest(BaseModel): request_type: Literal["chat"] = "chat" +class DocSumChatCompletionRequest(BaseModel): + llm_params: Optional[ChatCompletionRequest] = None + text: Optional[str] = None + audio: Optional[str] = None + video: Optional[str] = None + type: Optional[str] = None + + class AudioChatCompletionRequest(BaseModel): audio: str messages: Optional[ diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 71b6f15ec..712b461b2 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -20,6 +20,10 @@ class TextDoc(BaseDoc, TopologyInfo): text: str = None +class Audio2text(BaseDoc, TopologyInfo): + query: str = None + + class FactualityDoc(BaseDoc): reference: str text: str @@ -74,6 +78,12 @@ class Base64ByteStrDoc(BaseDoc): byte_str: str +class DocSumDoc(BaseDoc): + text: Optional[str] = None + audio: Optional[str] = None + video: Optional[str] = None + + class DocPath(BaseDoc): path: str chunk_size: int = 1500 diff --git a/comps/dataprep/multimedia2text/Dockerfile b/comps/dataprep/multimedia2text/Dockerfile new file mode 100644 index 000000000..54b39b72f --- /dev/null +++ b/comps/dataprep/multimedia2text/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Use the official Python 3.11 slim image as the base image +FROM python:3.11-slim + +# Set environment variables +ENV LANG=C.UTF-8 + +# Install necessary packages and clean up to reduce image size +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Create a directory for the user and set it as the working directory 
+WORKDIR /home/user + +# Copy the application code and requirements file to the container +COPY comps /home/user/comps +COPY requirements.txt /home/user/requirements.txt +COPY ./comps/dataprep/multimedia2text/multimedia2text.py /home/user/multimedia2text.py + +# Install Python dependencies +RUN python -m pip install --no-cache-dir -r requirements.txt + +# Define the entry point for the container +ENTRYPOINT ["python", "multimedia2text.py"] diff --git a/comps/dataprep/multimedia2text/README.md b/comps/dataprep/multimedia2text/README.md new file mode 100644 index 000000000..3adef100e --- /dev/null +++ b/comps/dataprep/multimedia2text/README.md @@ -0,0 +1,220 @@ +# Multimedia to Text Services + +This guide provides instructions on how to build and run various Docker services for converting multimedia content to text. The services include: + +1. **Whisper Service**: Converts audio to text. +2. **A2T Service**: Another service for audio to text conversion. +3. **Video to Audio Service**: Extracts audio from video files. +4. **Multimedia2Text Service**: Transforms multimedia data to text data. + +## Prerequisites + +1. **Docker**: Ensure you have Docker installed and running on your system. You can download and install Docker from the [official Docker website](https://www.docker.com/get-started). + +2. **Proxy Settings**: If you are behind a corporate firewall, make sure you have the necessary proxy settings configured. This will ensure that Docker and other tools can access the internet. + +3. **Python**: If you want to validate services using the provided Python scripts, ensure you have Python 3.11 installed. The current validation tests have been tested with Python 3.11. You can check your Python version by running the following command in your terminal: + ```bash + python --version + ``` + +## Getting Started + +First, navigate to the `GenAIComps` directory: + +```bash +cd GenAIComps +``` + +### Whisper Service + +The Whisper Service converts audio files to text. 
Follow these steps to build and run the service: + +#### Build + +```bash +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile . +``` + +#### Run + +```bash +docker run -d -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest +``` + +### A2T Service + +The A2T Service is another service for converting audio to text. Follow these steps to build and run the service: + +#### Build + +```bash +docker build -t opea/a2t:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/audio2text/Dockerfile . +``` + +#### Run + +```bash +host_ip=$(hostname -I | awk '{print $1}') + +docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e A2T_ENDPOINT=http://$host_ip:7066 opea/a2t:latest +``` + +### Video to Audio Service + +The Video to Audio Service extracts audio from video files. Follow these steps to build and run the service: + +#### Build + +```bash +docker build -t opea/v2a:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/video2audio/Dockerfile . +``` + +#### Run + +```bash +docker run -d -p 7078:7078 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/v2a:latest +``` + +### Multimedia2Text Service + +The Multimedia2Text Service transforms multimedia data to text data. Follow these steps to build and run the service: + +#### Build + +```bash +docker build -t opea/multimedia2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/Dockerfile . 
+``` + +#### Run + +```bash +host_ip=$(hostname -I | awk '{print $1}') + +docker run -d -p 7079:7079 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ + -e A2T_ENDPOINT=http://$host_ip:7066 \ + -e V2A_ENDPOINT=http://$host_ip:7078 \ + opea/multimedia2text:latest +``` + +## Validate Microservices + +After building and running the services, you can validate them using the provided Python scripts. Below are the steps to validate each service: + +### Whisper Service + +Run the following command to validate the Whisper Service: + +```bash +python comps/asr/whisper/dependency/check_whisper_server.py +``` + +Expected output: + +``` +{'asr_result': 'who is pat gelsinger'} +``` + +### Audio2Text Service + +Run the following command to validate the Audio2Text Service: + +```bash +python comps/dataprep/multimedia2text/audio2text/check_a2t_server.py +``` + +Expected output: + +``` +Test passed successfully! +``` + +_Note: The `id` value will be different._ + +### Video2Audio Service + +Run the following command to validate the Video2Audio Service: + +```bash +python comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py +``` + +Expected output: + +``` +========= Audio file saved as ====== +comps/dataprep/multimedia2text/video2audio/converted_audio.wav +==================================== +``` + +### Multimedia2Text Service + +Run the following command to validate the Multimedia2Text Service: + +```bash +python comps/dataprep/multimedia2text/check_multimedia2text.py +``` + +Expected output: + +``` +Running test: Whisper service +>>> Whisper service Test Passed ... + +Running test: Audio2Text service +>>> Audio2Text service Test Passed ... + +Running test: Video2Text service +>>> Video2Text service Test Passed ... + +Running test: Multimedia2text service +>>> Multimedia2text service test for text data type passed ... +>>> Multimedia2text service test for audio data type passed ... +>>> Multimedia2text service test for video data type passed ... 
+``` + +## How to Stop/Remove Services + +To stop and remove the Docker containers and images associated with the multimedia-to-text services, follow these steps: + +1. **List Running Containers**: First, list all running Docker containers to identify the ones you want to stop and remove. + + ```bash + docker ps + ``` + +2. **Stop Containers**: Use the `docker stop` command followed by the container IDs or names to stop the running containers. + + ```bash + docker stop + ``` + + If you want to stop all running containers at once, you can use: + + ```bash + docker stop $(docker ps -q) + ``` + +3. **Remove Containers**: After stopping the containers, use the `docker rm` command followed by the container IDs or names to remove them. + + ```bash + docker rm + ``` + + Optionally, you can remove the stopped containers to free up resources: + + ```bash + docker rm $(docker ps -a -q) + ``` + +4. **Remove Images**: If you also want to remove the Docker images, use the `docker rmi` command followed by the image IDs or names. 
+ + ```bash + docker rmi + ``` + + To remove all unused images, you can use: + + ```bash + docker image prune -a + ``` diff --git a/comps/dataprep/multimedia2text/audio2text/Dockerfile b/comps/dataprep/multimedia2text/audio2text/Dockerfile new file mode 100644 index 000000000..57707260f --- /dev/null +++ b/comps/dataprep/multimedia2text/audio2text/Dockerfile @@ -0,0 +1,37 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Use the official Python 3.11 slim image as the base image +FROM python:3.11-slim + +# Create a new user and set up the home directory +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ +USER user + +# Set environment variables +ENV LANG=C.UTF-8 +ARG ARCH=cpu + +# Copy the application code and requirements file to the container +COPY comps /home/user/comps +COPY requirements.txt /home/user/requirements.txt + +# Install Python dependencies +RUN pip install --no-cache-dir --upgrade pip && \ + if [ "${ARCH}" = "cpu" ]; then \ + pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/requirements.txt ; \ + else \ + pip install --no-cache-dir -r /home/user/requirements.txt ; \ + fi + +# Set the PYTHONPATH environment variable +ENV PYTHONPATH=$PYTHONPATH:/home/user + +# Set the working directory +WORKDIR /home/user/comps/dataprep/multimedia2text/audio2text + +# Define the entry point for the container +ENTRYPOINT ["python", "audio2text.py"] diff --git a/comps/dataprep/multimedia2text/audio2text/audio2text.py b/comps/dataprep/multimedia2text/audio2text/audio2text.py new file mode 100644 index 000000000..650c5704c --- /dev/null +++ b/comps/dataprep/multimedia2text/audio2text/audio2text.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os + +import requests + +from comps import 
CustomLogger + +# Initialize custom logger +logger = CustomLogger("a2t") +logflag = os.getenv("LOGFLAG", False) + +from comps import ( + Audio2text, + Base64ByteStrDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, +) + + +# Register the microservice +@register_microservice( + name="opea_service@a2t", + service_type=ServiceType.ASR, + endpoint="/v1/audio/transcriptions", + host="0.0.0.0", + port=9099, + input_datatype=Base64ByteStrDoc, + output_datatype=Audio2text, +) +@register_statistics(names=["opea_service@a2t"]) +async def audio_to_text(audio: Base64ByteStrDoc): + """Convert audio to text and return the transcription. + + Args: + audio (Base64ByteStrDoc): The incoming request containing the audio in base64 format. + + Returns: + TextDoc: The response containing the transcription text. + """ + try: + # Validate the input + if not audio or not audio.byte_str: + raise ValueError("Invalid input: 'audio' or 'audio.byte_str' is missing.") + + byte_str = audio.byte_str + inputs = {"audio": byte_str} + + if logflag: + logger.info(f"Inputs: {inputs}") + + # Send the POST request to the ASR endpoint + response = requests.post(url=f"{a2t_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None}) + response.raise_for_status() # Raise an error for bad status codes + + if logflag: + logger.info(f"Response: {response.json()}") + + # Return the transcription result + return Audio2text(query=response.json()["asr_result"]) # .text + + except requests.RequestException as e: + logger.error(f"Request to ASR endpoint failed: {e}") + raise + except Exception as e: + logger.error(f"An error occurred during audio to text conversion: {e}") + raise + + +if __name__ == "__main__": + try: + # Get the ASR endpoint from environment variables or use the default + a2t_endpoint = os.getenv("A2T_ENDPOINT", "http://localhost:7066") + + # Log initialization message + logger.info("[a2t - router] A2T initialized.") + + # Start the 
microservice + opea_microservices["opea_service@a2t"].start() + + except Exception as e: + logger.error(f"Failed to start the microservice: {e}") + raise diff --git a/comps/dataprep/multimedia2text/audio2text/check_a2t_server.py b/comps/dataprep/multimedia2text/audio2text/check_a2t_server.py new file mode 100644 index 000000000..8009fc543 --- /dev/null +++ b/comps/dataprep/multimedia2text/audio2text/check_a2t_server.py @@ -0,0 +1,86 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +import json +import os + +import requests + +# Get the root folder of the current script +root_folder = os.path.dirname(os.path.abspath(__file__)) + + +def audio_to_text(path_to_audio): + """Convert an audio file to text by sending a request to the server. + + Args: + path_to_audio (str): Path to the audio file. + + Returns: + str: The transcribed text. + """ + file_name = os.path.join(root_folder, path_to_audio) + + # Read the audio file and encode it in base64 + with open(file_name, "rb") as f: + audio_base64_str = base64.b64encode(f.read()).decode("utf-8") + + endpoint = "http://localhost:9099/v1/audio/transcriptions" + inputs = {"byte_str": audio_base64_str} + + # Send the POST request to the server + response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) + + # Check if the request was successful + response.raise_for_status() + + # Return the transcribed text + return response.json()["query"] + + +def check_response(response): + """Check the response from the server and print the result. + + Args: + response (str): The transcribed text from the server. + """ + expected_response = "well" + assert response == expected_response, f"Expected '{expected_response}', but got '{response}'" + print("Test passed successfully!") + + +def read_config(): + """Read the configuration parameters from the input file. + + Returns: + argparse.Namespace: Parsed arguments. 
+ """ + # Create an argument parser + parser = argparse.ArgumentParser(description="Process configuration parameters.") + + # Add argument for the audio file path + parser.add_argument( + "--path_to_audio", + help="Location of the audio file that will be converted to text.", + required=False, + default=os.path.join(root_folder, "../data/intel_short.wav"), + ) + + # Parse the arguments + args = parser.parse_args() + + # Return the parsed arguments + return args + + +if __name__ == "__main__": + # Read the configuration parameters + args = read_config() + + # Convert audio to text + response = audio_to_text(args.path_to_audio) + + # Check the response + check_response(response) diff --git a/comps/dataprep/multimedia2text/check_multimedia2text.py b/comps/dataprep/multimedia2text/check_multimedia2text.py new file mode 100644 index 000000000..9aeb735a7 --- /dev/null +++ b/comps/dataprep/multimedia2text/check_multimedia2text.py @@ -0,0 +1,154 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import ast +import base64 +import json +import os + +import requests + +# Get the root folder of the current script +root_folder = os.path.dirname(os.path.abspath(__file__)) + + +def get_base64_str(file_name): + """Convert a file to a base64 encoded string. + + Args: + file_name (str): Path to the file. + + Returns: + str: Base64 encoded string of the file content. + """ + with open(file_name, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + + +def post_request(endpoint, inputs): + """Send a POST request to the specified endpoint. + + Args: + endpoint (str): The URL of the endpoint. + inputs (dict): The data to be sent in the request. + + Returns: + requests.Response: The response from the server. + """ + return requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) + + +def input_data_for_test(document_type): + """Generate input data for testing based on the document type. 
+ + Args: + document_type (str): The type of document ("text", "audio", or "video"). + + Returns: + str: The input data for testing. + + Raises: + ValueError: If the document type is invalid. + """ + if document_type == "text": + input_data = "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." + elif document_type == "audio": + input_data = get_base64_str(os.path.join(root_folder, "data/intel_short.wav")) + elif document_type == "video": + input_data = get_base64_str(os.path.join(root_folder, "data/intel_short.mp4")) + else: + raise ValueError("Invalid document type") + + return input_data + + +def test_whisper_service(): + """Test the Whisper service. + + Raises: + AssertionError: If the service does not return a 200 status code. + """ + print("Running test: Whisper service") + document_type = "audio" + endpoint = "http://localhost:7066/v1/asr" + inputs = {"audio": input_data_for_test(document_type)} + response = post_request(endpoint, inputs) + assert ( + response.status_code == 200 + ), f"Whisper service failed to get response from the server. 
Status code: {response.status_code}" + + # If the response status code is 200, print "Test passed" + print(">>> Whisper service Test Passed ... ") + print() + + +def test_audio2text(): + """Test the Audio2Text service. + + Raises: + AssertionError: If the service does not return a 200 status code. + """ + print("Running test: Audio2Text service") + document_type = "audio" + endpoint = "http://localhost:9099/v1/audio/transcriptions" + inputs = {"byte_str": input_data_for_test(document_type)} + response = post_request(endpoint, inputs) + assert ( + response.status_code == 200 + ), f"Audio2Text service failed to get response from the server. Status code: {response.status_code}" + + # If the response status code is 200, print "Test passed" + print(">>> Audio2Text service Test Passed ... ") + print() + + +def test_video2text(): + """Test the Video2Text service. + + Raises: + AssertionError: If the service does not return a 200 status code. + """ + print("Running test: Video2Text service") + document_type = "video" + endpoint = "http://localhost:7078/v1/video2audio" + inputs = {"byte_str": input_data_for_test(document_type)} + response = post_request(endpoint, inputs) + assert ( + response.status_code == 200 + ), f"Video2Text service failed to get response from the server. Status code: {response.status_code}" + + # If the response status code is 200, print "Test passed" + print(">>> Video2Text service Test Passed ... ") + print() + + +def test_multimedia2text_data(): + """Test the multimedia2text service for different document types. + + Raises: + AssertionError: If the service does not return a 200 status code. 
+ """ + print("Running test: Multimedia2text service") + for document_type in ["text", "audio", "video"]: + endpoint = "http://localhost:7079/v1/multimedia2text" + inputs = {document_type: input_data_for_test(document_type)} + response = post_request(endpoint, inputs) + assert ( + response.status_code == 200 + ), f"{document_type} service failed to get response from the server. Status code: {response.status_code}" + + # If the response status code is 200, print "Test passed" + print(f">>> Multimedia2text service test for {document_type} data type passed ... ") + print() + + +if __name__ == "__main__": + # Run the tests and print the results + try: + test_whisper_service() + test_audio2text() + test_video2text() + test_multimedia2text_data() + + except AssertionError as e: + print(f"Test failed: {e}") diff --git a/comps/dataprep/multimedia2text/data/README.md b/comps/dataprep/multimedia2text/data/README.md new file mode 100644 index 000000000..89330dbac --- /dev/null +++ b/comps/dataprep/multimedia2text/data/README.md @@ -0,0 +1,31 @@ +# Test Data for Document Summarization + +## Overview + +This document provides information about the test data used for the Document Summarization application. + +## Source of Test Data + +The data used for testing originated from the following video: + +[YouTube Video](https://www.youtube.com/watch?v=HUpnCtJRTg4) + +## Description of Test Data + +1. **Video File**: We extracted a 1-second segment from the above video and saved it as `intel_short.mp4`. +2. **Audio File**: The audio was extracted from the `intel_short.mp4` video file and saved as `intel_short.wav`. + +These files are used to test the functionality of the Document Summarization application, including the conversion of multimedia content to text. + +## Files + +- `intel_short.mp4`: A 1-second video segment extracted from the YouTube video. +- `intel_short.wav`: An audio file converted from the `intel_short.mp4` video file. 
+ +## Usage + +These files can be used to validate the multimedia-to-text services provided by the Document Summarization application. Ensure that the files are placed in the appropriate directory as specified in the application's configuration. + +## License + +The original video content is subject to the terms and conditions of YouTube and the content creator. The extracted segments are used solely for testing and validation purposes. diff --git a/comps/dataprep/multimedia2text/data/intel_short.mp4 b/comps/dataprep/multimedia2text/data/intel_short.mp4 new file mode 100644 index 000000000..6b72f4122 Binary files /dev/null and b/comps/dataprep/multimedia2text/data/intel_short.mp4 differ diff --git a/comps/dataprep/multimedia2text/data/intel_short.wav b/comps/dataprep/multimedia2text/data/intel_short.wav new file mode 100644 index 000000000..21657414d Binary files /dev/null and b/comps/dataprep/multimedia2text/data/intel_short.wav differ diff --git a/comps/dataprep/multimedia2text/multimedia2text.py b/comps/dataprep/multimedia2text/multimedia2text.py new file mode 100644 index 000000000..68f0181c9 --- /dev/null +++ b/comps/dataprep/multimedia2text/multimedia2text.py @@ -0,0 +1,90 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os + +import requests + +from comps import CustomLogger + +# Initialize custom logger +logger = CustomLogger("multimedia2text") + +from comps import Audio2text, DocSumDoc, ServiceType, opea_microservices, register_microservice, register_statistics + + +# Register the microservice +@register_microservice( + name="opea_service@multimedia2text", + service_type=ServiceType.ASR, + endpoint="/v1/multimedia2text", + host="0.0.0.0", + port=7079, + input_datatype=DocSumDoc, + output_datatype=Audio2text, +) +@register_statistics(names=["opea_service@multimedia2text"]) +async def audio_to_text(input: DocSumDoc): + """Convert video or audio input to text using external services. 
+ + Args: + input (DocSumDoc): Input document containing video, audio, or text data. + + Returns: + Audio2text: Object containing the ASR result or input text. + """ + response_to_return = None + + # Process video input + if input.video is not None: + logger.info(f"Processing video input at {v2a_endpoint}/v1/video2audio") + inputs = {"byte_str": input.video} + response = requests.post(url=f"{v2a_endpoint}/v1/video2audio", data=json.dumps(inputs), proxies={"http": None}) + response.raise_for_status() # Ensure the request was successful + input.audio = response.json().get("byte_str") + if input.audio is None: + logger.error("Failed to extract audio from video") + raise ValueError("Failed to extract audio from video") + + # Process audio input + if input.audio is not None: + logger.info(f"Processing audio input at {a2t_endpoint}/v1/asr") + inputs = {"audio": input.audio} + response = requests.post(url=f"{a2t_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None}) + response.raise_for_status() # Ensure the request was successful + response_to_return = response.json().get("asr_result") + if response_to_return is None: + logger.error("Failed to get ASR result from audio") + raise ValueError("Failed to get ASR result from audio") + + # Process text input + if input.text is not None: + logger.info("Processing text input") + response_to_return = input.text + + if response_to_return is None: + logger.warning("No valid input provided") + response_to_return = "No input" + else: + logger.info("Data Processing completed") + + return Audio2text(query=response_to_return) + + +if __name__ == "__main__": + try: + # Get the V2A endpoint from environment variables or use the default + v2a_endpoint = os.getenv("V2A_ENDPOINT", "http://localhost:7078") + # Get the A2T endpoint from environment variables or use the default + a2t_endpoint = os.getenv("A2T_ENDPOINT", "http://localhost:7066") + + # Log initialization message + logger.info("[multimedia2text - router] 
multimedia2text initialized.") + + # Start the microservice + opea_microservices["opea_service@multimedia2text"].start() + + except Exception as e: + logger.error(f"Failed to start the multimedia2text microservice: {e}") + raise diff --git a/comps/dataprep/multimedia2text/video2audio/Dockerfile b/comps/dataprep/multimedia2text/video2audio/Dockerfile new file mode 100644 index 000000000..32b2fe8ee --- /dev/null +++ b/comps/dataprep/multimedia2text/video2audio/Dockerfile @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Use the official Python 3.11 slim image as the base image +FROM python:3.11-slim + +# Set environment variables +ENV LANG=C.UTF-8 + +# Install necessary packages +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +# Create a directory for the user +RUN mkdir -p /home/user + +# Copy the application code to the container +COPY comps /home/user/comps +COPY requirements.txt /home/user/requirements.txt +COPY ./comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py /home/user/video2audio_microservice.py + +# Install Python dependencies +RUN python -m pip install --no-cache-dir -r /home/user/requirements.txt moviepy + +# Set the working directory +WORKDIR /home/user/ + +# Define the entry point for the container +ENTRYPOINT ["python", "video2audio_microservice.py"] diff --git a/comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py b/comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py new file mode 100644 index 000000000..d8499faa1 --- /dev/null +++ b/comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py @@ -0,0 +1,92 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +import json +import os + +import requests + +# Get the root folder of the current script +root_folder = 
os.path.dirname(os.path.abspath(__file__)) + + +def video_to_audio(path_to_video): + """Convert a video file to an audio file in base64 format by sending a request to the server. + + Args: + path_to_video (str): Path to the video file. + + Returns: + str: Base64 encoded audio file. + """ + file_name = os.path.join(root_folder, path_to_video) + + # Read the video file and encode it in base64 + with open(file_name, "rb") as f: + video_base64_str = base64.b64encode(f.read()).decode("utf-8") + + # Define the endpoint and payload + endpoint = "http://localhost:7078/v1/video2audio" + inputs = {"byte_str": video_base64_str} + + # Send the POST request to the server + response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) + + # Check if the request was successful + response.raise_for_status() + + # Extract the base64 encoded audio from the response + audio_base64 = response.json()["byte_str"] + + return audio_base64 + + +def read_config(): + """Function to read the configuration parameters from the input file. + Returns the parsed arguments. + + Returns: + argparse.Namespace: Parsed arguments. 
+ """ + # Create an argument parser + parser = argparse.ArgumentParser(description="Process configuration parameters.") + + # Add argument for the video file path + parser.add_argument( + "--path_to_video", + help="Location of the video file that will be converted to audio.", + required=False, + default=os.path.join(root_folder, "../data/intel_short.mp4"), + ) + + # Add argument for the audio file path + parser.add_argument( + "--path_to_audio", + help="Location to save the extracted audio file.", + required=False, + default=os.path.join(root_folder, "converted_audio.wav"), + ) + + # Parse the arguments + args = parser.parse_args() + + # Return the parsed arguments + return args + + +if __name__ == "__main__": + # Read the configuration parameters + args = read_config() + + # Extract audio from video + audio_base64 = video_to_audio(args.path_to_video) + + # Save the extracted audio to a file + with open(args.path_to_audio, "wb") as f: + f.write(base64.b64decode(audio_base64)) + + print("========= Audio file saved as ======") + print(args.path_to_audio) + print("====================================") diff --git a/comps/dataprep/multimedia2text/video2audio/video2audio.py b/comps/dataprep/multimedia2text/video2audio/video2audio.py new file mode 100644 index 000000000..0f454f1c3 --- /dev/null +++ b/comps/dataprep/multimedia2text/video2audio/video2audio.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +import uuid +from os import path, remove + +from moviepy.editor import VideoFileClip + +# Get the root folder of the current script +root_folder = path.dirname(path.abspath(__file__)) + + +class Video2Audio: + """Class to convert video files to audio files and handle base64 encoding.""" + + def __init__(self): + pass + + def validate_file_exists(self, file_path): + """Validate if the given file exists. + + Args: + file_path (str): Path to the file. 
+ + Raises: + FileNotFoundError: If the file does not exist. + """ + if not path.isfile(file_path): + raise FileNotFoundError(f"The file {file_path} does not exist.") + + def convert_video_to_audio(self, path_to_video, audio_file_name): + """Extract mp3 audio file from mp4 video file. + + Args: + path_to_video (str): Path to the video file. + audio_file_name (str): Path to save the extracted audio file. + """ + # Validate the video file exists + self.validate_file_exists(path_to_video) + + # Extract audio from video + clip = VideoFileClip(path_to_video) + clip.audio.write_audiofile(audio_file_name) + print(f"Audio extracted and saved to {audio_file_name}") + + def convert_base64(self, file_name): + """Convert a file to a base64 encoded string and remove the file. + + Args: + file_name (str): Path to the file to be encoded. + + Returns: + str: Base64 encoded string of the file content. + """ + # Validate the file exists + self.validate_file_exists(file_name) + + # Read the file and encode it in base64 + with open(file_name, "rb") as f: + base64_str = base64.b64encode(f.read()).decode("utf-8") + + # Remove the file after encoding + remove(file_name) + + return base64_str + + def convert_video_to_audio_base64(self, video_file_name): + """Convert a video file to an audio file and return the audio file as a base64 encoded string. + + Args: + video_file_name (str): Path to the video file. + + Returns: + str: Base64 encoded string of the extracted audio file. 
+ """ + # Generate a unique identifier for the audio file + uid = str(uuid.uuid4()) + audio_file_name = uid + ".mp3" + + # Convert the video to audio + self.convert_video_to_audio(video_file_name, audio_file_name) + + # Convert the audio file to a base64 encoded string + base64_str = self.convert_base64(audio_file_name) + + return base64_str diff --git a/comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py b/comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py new file mode 100644 index 000000000..f1b4b906a --- /dev/null +++ b/comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +import os +import uuid + +import requests + +from comps import ( + Base64ByteStrDoc, + CustomLogger, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, +) +from comps.dataprep.multimedia2text.video2audio.video2audio import Video2Audio + +# Initialize custom logger +logger = CustomLogger("video2audio") +logflag = os.getenv("LOGFLAG", False) + + +# Register the microservice +@register_microservice( + name="opea_service@video2audio", + service_type=ServiceType.DATAPREP, + endpoint="/v1/video2audio", + host="0.0.0.0", + port=7078, + input_datatype=Base64ByteStrDoc, + output_datatype=Base64ByteStrDoc, +) +@register_statistics(names=["opea_service@video2audio"]) +async def audio_to_text(request: Base64ByteStrDoc): + """Convert video to audio and return the result in base64 format. + + Args: + request (Base64ByteStrDoc): The incoming request containing the video in base64 format. + + Returns: + Base64ByteStrDoc: The response containing the audio in base64 format. 
+ """ + try: + # Generate a unique identifier for the video file + uid = str(uuid.uuid4()) + file_name = uid + ".mp4" + + logger.info("Received request for video to audio conversion.") + byte_str = request.byte_str + + # Decode the base64 string and save it as a video file + with open(file_name, "wb") as f: + f.write(base64.b64decode(byte_str)) + + # Convert the video file to audio and get the result in base64 format + response = v2a.convert_video_to_audio_base64(file_name) + + # Remove the temporary video file + os.remove(file_name) + + logger.info("Successfully converted video to audio.") + return Base64ByteStrDoc(byte_str=response) + + except requests.RequestException as e: + logger.error(f"Request to video-to-audio endpoint failed: {e}") + raise + except Exception as e: + logger.error(f"An error occurred during video to audio conversion: {e}") + raise + + +if __name__ == "__main__": + try: + # Initialize the Video2Audio instance + v2a = Video2Audio() + + # Log initialization message + logger.info("[video2audio - router] VIDEO2AUDIO initialized.") + + # Start the microservice + opea_microservices["opea_service@video2audio"].start() + + except Exception as e: + logger.error(f"Failed to start the microservice: {e}") + raise diff --git a/tests/asr/test_asr_whisper.sh b/tests/asr/test_asr_whisper.sh index 226898a8c..d0928cf34 100644 --- a/tests/asr/test_asr_whisper.sh +++ b/tests/asr/test_asr_whisper.sh @@ -10,14 +10,17 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/whisper:comps -f comps/asr/whisper/dependency/Dockerfile . + docker build --no-cache -t opea/whisper:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile . + if [ $? -ne 0 ]; then echo "opea/whisper built fail" exit 1 else echo "opea/whisper built successful" fi - docker build --no-cache -t opea/asr:comps -f comps/asr/whisper/Dockerfile . 
+ + docker build --no-cache -t opea/asr:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . + if [ $? -ne 0 ]; then echo "opea/asr built fail" exit 1 @@ -30,7 +33,7 @@ function start_service() { unset http_proxy docker run -d --name="test-comps-asr-whisper" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 7066:7066 --ipc=host opea/whisper:comps docker run -d --name="test-comps-asr" -e ASR_ENDPOINT=http://$ip_address:7066 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9089:9099 --ipc=host opea/asr:comps - sleep 3m + sleep 60s } function validate_microservice() { diff --git a/tests/dataprep/test_dataprep_multimedia.sh b/tests/dataprep/test_dataprep_multimedia.sh new file mode 100644 index 000000000..30592c86a --- /dev/null +++ b/tests/dataprep/test_dataprep_multimedia.sh @@ -0,0 +1,242 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# set -xe + +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" + +host_ip=$(hostname -I | awk '{print $1}') + +export REGISTRY=${IMAGE_REPO} +export TAG=${IMAGE_TAG} +export no_proxy="${no_proxy},${host_ip}" + +export V2A_SERVICE_HOST_IP=${host_ip} +export V2A_ENDPOINT=http://$host_ip:7078 + +export A2T_ENDPOINT=http://$host_ip:7066 +export A2T_SERVICE_HOST_IP=${host_ip} +export A2T_SERVICE_PORT=9099 + +export DATA_ENDPOINT=http://$host_ip:7079 +export DATA_SERVICE_HOST_IP=${host_ip} +export DATA_SERVICE_PORT=7079 + +# Get the root folder of the current script +ROOT_FOLDER=$(dirname "$(readlink -f "$0")") + +function build_docker_images() { + cd $WORKPATH + echo "Current working directory: $(pwd)" + + # Array of Docker build configurations + declare -A docker_builds=( + ["opea/whisper:comps"]="comps/asr/whisper/dependency/Dockerfile" + 
["opea/a2t:comps"]="comps/dataprep/multimedia2text/audio2text/Dockerfile" + ["opea/v2a:comps"]="comps/dataprep/multimedia2text/video2audio/Dockerfile" + ["opea/multimedia2text:comps"]="comps/dataprep/multimedia2text/Dockerfile" + ) + + # Loop through the array and build each Docker image + for image in "${!docker_builds[@]}"; do + dockerfile=${docker_builds[$image]} + echo "Building Docker image: $image from Dockerfile: $dockerfile" + + docker build --no-cache -t $image --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $dockerfile . + + if [ $? -ne 0 ]; then + echo "$image build failed" + exit 1 + else + echo "$image build successful" + fi + done + + # List Docker images and wait for 1 second + docker images && sleep 1s +} + +function start_services() { + + docker run -d -p 7066:7066 --name="test-comps-mm-whisper-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:comps + if [ $? -ne 0 ]; then + echo "opea/whisper service fail to start" + exit 1 + else + echo "opea/whisper start successful" + fi + + + docker run -d -p 9099:9099 --name="test-comps-mm-a2t-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e A2T_ENDPOINT=http://$host_ip:7066 opea/a2t:comps + if [ $? -ne 0 ]; then + echo "opea/a2t service fail to start" + exit 1 + else + echo "opea/a2t start successful" + fi + + docker run -d -p 7078:7078 --name="test-comps-mm-v2a-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/v2a:comps + if [ $? -ne 0 ]; then + echo "opea/v2a service fail to start" + exit 1 + else + echo "opea/v2a start successful" + fi + + + docker run -d -p 7079:7079 --name="test-comps-mm-multimedia2text-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ + -e A2T_ENDPOINT=http://$host_ip:7066 \ + -e V2A_ENDPOINT=http://$host_ip:7078 \ + opea/multimedia2text:comps + + if [ $? 
-ne 0 ]; then + echo "opea/multimedia2text service fail to start" + exit 1 + else + echo "opea/multimedia2text start successful" + fi + + sleep 120s + +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "EXPECTED_RESULT==> $EXPECTED_RESULT" + echo "CONTENT==> $CONTENT" + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s + +} + +get_base64_str() { + local file_name=$1 + base64 -w 0 "$file_name" +} + +# Function to generate input data for testing based on the document type +input_data_for_test() { + local document_type=$1 + case $document_type in + ("text") + echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." + ;; + ("audio") + # get_base64_str "$ROOT_FOLDER/data/test.wav" + get_base64_str "$WORKPATH/comps/dataprep/multimedia2text/data/intel_short.wav" + ;; + ("video") + # get_base64_str "$ROOT_FOLDER/data/test.mp4" + get_base64_str "$WORKPATH/comps/dataprep/multimedia2text/data/intel_short.mp4" + ;; + (*) + echo "Invalid document type" >&2 + exit 1 + ;; + esac +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
+ + # whisper microservice + ulimit -s 65536 + validate_services \ + "${host_ip}:7066/v1/asr" \ + '{"asr_result":"well"}' \ + "whisper-service" \ + "whisper-service" \ + "{\"audio\": \"$(input_data_for_test "audio")\"}" + + # Audio2Text service + validate_services \ + "${host_ip}:9099/v1/audio/transcriptions" \ + '"query":"well"' \ + "a2t" \ + "a2t-service" \ + "{\"byte_str\": \"$(input_data_for_test "audio")\"}" + + # Video2Audio service + validate_services \ + "${host_ip}:7078/v1/video2audio" \ + "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAAAJAYwAAAAAAAADd9L18KaAAAAAAAAAAAAAAAAAAAAAP/7kGQAAAMhClSVMEACMOAabaCMAREA" \ + "v2a" \ + "v2a-service" \ + "{\"byte_str\": \"$(input_data_for_test "video")\"}" + + # Docsum Data service - video + validate_services \ + "${host_ip}:7079/v1/multimedia2text" \ + '"query":"well"' \ + "multimedia2text-service" \ + "multimedia2text" \ + "{\"video\": \"$(input_data_for_test "video")\"}" + + # Docsum Data service - audio + validate_services \ + "${host_ip}:7079/v1/multimedia2text" \ + '"query":"well"' \ + "multimedia2text-service" \ + "multimedia2text" \ + "{\"audio\": \"$(input_data_for_test "audio")\"}" + + # Docsum Data service - text + validate_services \ + "${host_ip}:7079/v1/multimedia2text" \ + "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \ + "multimedia2text-service" \ + "multimedia2text" \ + "{\"text\": \"$(input_data_for_test "text")\"}" + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-mm-*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + echo "All specified services have been stopped and removed." 
+} + +function main() { + + stop_docker + if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi + start_services + validate_microservices + stop_docker + echo y | docker system prune +} + +main