ENH: make MiniCPM v2.6 support video #2068

Merged · 13 commits · Aug 16, 2024
1 change: 1 addition & 0 deletions .github/workflows/python.yaml
@@ -134,6 +134,7 @@ jobs:
pip install -e ".[dev]"
pip install "jinja2==3.1.2"
pip install tensorizer
pip install eva-decord
working-directory: .

- name: Test with pytest
2 changes: 2 additions & 0 deletions setup.cfg
@@ -122,6 +122,7 @@ all =
openai-whisper # For CosyVoice
boto3>=1.28.55,<1.28.65 # For tensorizer
tensorizer~=2.9.0
eva-decord # For video in VL
intel =
torch==2.1.0a0
intel_extension_for_pytorch==2.1.10+xpu
@@ -144,6 +145,7 @@ transformers =
timm>=0.9.16 # For deepseek VL
torchvision # For deepseek VL
peft
eva-decord # For video in VL
vllm =
vllm>=0.2.6
sglang =
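Side note on the new dependency: eva-decord is a drop-in fork of decord and installs under the plain `decord` module name, which is why the loader code in minicpmv26.py below imports `decord` directly. A quick sanity check that the package is usable in an environment (illustrative only, not part of this PR):

import decord  # provided by the eva-decord package
from decord import VideoReader, cpu  # the symbols minicpmv26.py imports below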
53 changes: 44 additions & 9 deletions xinference/core/chat_interface.py
@@ -236,8 +236,8 @@ def predict(history, bot, max_tokens, temperature, stream):
bot[-1][1] = history[-1]["content"]
yield history, bot

def add_text(history, bot, text, image):
logger.debug("Add text, text: %s, image: %s", text, image)
def add_text(history, bot, text, image, video):
logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
if image:
buffered = BytesIO()
with PIL.Image.open(image) as img:
@@ -257,16 +257,47 @@ def add_text(history, bot, text, image):
},
],
}
elif video:

def video_to_base64(video_path):
with open(video_path, "rb") as video_file:
encoded_string = base64.b64encode(video_file.read()).decode(
"utf-8"
)
return encoded_string

def generate_html_video(video_path):
base64_video = video_to_base64(video_path)
video_format = video_path.split(".")[-1]
html_code = f"""
<video controls>
<source src="data:video/{video_format};base64,{base64_video}" type="video/{video_format}">
Your browser does not support the video tag.
</video>
"""
return html_code

display_content = f"{generate_html_video(video)}\n{text}"
message = {
"role": "user",
"content": [
{"type": "text", "text": text},
{
"type": "video_url",
"video_url": {"url": video},
},
],
}
else:
display_content = text
message = {"role": "user", "content": text}
history = history + [message]
bot = bot + [[display_content, None]]
return history, bot, "", None
return history, bot, "", None, None

def clear_history():
logger.debug("Clear history.")
return [], None, "", None
return [], None, "", None, None

def update_button(text):
return gr.update(interactive=bool(text))
@@ -313,6 +344,7 @@ def update_button(text):
)
with gr.Column(scale=3):
imagebox = gr.Image(type="filepath")
videobox = gr.Video()
textbox = gr.Textbox(
show_label=False,
placeholder="Enter text and press ENTER",
@@ -340,8 +372,8 @@ def update_button(text):

textbox.submit(
add_text,
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox, videobox],
[state, chatbot, textbox, imagebox, videobox],
queue=False,
).then(
predict,
@@ -351,8 +383,8 @@ def update_button(text):

submit_btn.click(
add_text,
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox, videobox],
[state, chatbot, textbox, imagebox, videobox],
queue=False,
).then(
predict,
@@ -361,7 +393,10 @@ def update_button(text):
)

clear_btn.click(
clear_history, None, [state, chatbot, textbox, imagebox], queue=False
clear_history,
None,
[state, chatbot, textbox, imagebox, videobox],
queue=False,
)

return chat_vl_interface
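For reference, the video branch above mirrors the existing image handling: the Gradio UI embeds the clip as a base64 <video> tag for display, while the message appended to history uses an OpenAI-style content list with a "video_url" part. A minimal sketch of that message shape as a client might construct it (the text and file path are placeholders, not part of this PR):

# Hypothetical user turn carrying a video; this is the structure add_text()
# builds and _message_content_to_chat() in minicpmv26.py parses below.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What happens in this clip?"},
        {"type": "video_url", "video_url": {"url": "/path/to/clip.mp4"}},
    ],
}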
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements.txt
@@ -61,6 +61,7 @@ openai-whisper # For CosyVoice
boto3>=1.28.55,<1.28.65 # For tensorizer
tensorizer~=2.9.0
imageio-ffmpeg # For video
eva-decord # For video in VL

# sglang
outlines>=0.0.44
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements_cpu.txt
@@ -56,3 +56,4 @@ onnxruntime-gpu==1.16.0; sys_platform == 'linux' # For CosyVoice
onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # For CosyVoice
openai-whisper # For CosyVoice
imageio-ffmpeg # For video
eva-decord # For video in VL
89 changes: 67 additions & 22 deletions xinference/model/llm/pytorch/minicpmv26.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import logging
import time
import uuid
@@ -124,29 +123,60 @@ def _load_image(_url):
else:
return Image.open(BytesIO(response.content)).convert("RGB")

MAX_NUM_FRAMES = 64

def encode_video(video_path):
from decord import VideoReader, cpu

def uniform_sample(l, n):
gap = len(l) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [l[i] for i in idxs]

vr = VideoReader(video_path, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1) # FPS
frame_idx = [i for i in range(0, len(vr), sample_fps)]
if len(frame_idx) > MAX_NUM_FRAMES:
frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype("uint8")) for v in frames]
print("num frames:", len(frames))
return frames

def _load_video(_url):
frames = None
if _url.startswith("data:"):
raise RuntimeError("Only video url format is supported")
else:
frames = encode_video(_url)
return frames

if not isinstance(content, str):
texts = []
image_urls = []
video_urls = []
for c in content:
c_type = c.get("type")
if c_type == "text":
texts.append(c["text"])
elif c_type == "image_url":
image_urls.append(c["image_url"]["url"])
elif c_type == "video_url":
video_urls.append(c["video_url"]["url"])
image_futures = []
with ThreadPoolExecutor() as executor:
for image_url in image_urls:
fut = executor.submit(_load_image, image_url)
image_futures.append(fut)
images = [fut.result() for fut in image_futures]
frames = []
if len(video_urls) > 1:
raise RuntimeError("Only one video per message is supported")
for v in video_urls:
frames = _load_video(v)
text = " ".join(texts)
if len(images) == 0:
return text, []
elif len(images) == 1:
return text, images
else:
raise RuntimeError("Only one image per message is supported")
return content, []
return text, images, frames
return content, [], []

def chat(
self,
@@ -156,36 +186,51 @@ def chat(
generate_config: Optional[PytorchGenerateConfig] = None,
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
stream = generate_config.get("stream", False) if generate_config else False
content, images_chat = self._message_content_to_chat(prompt)
videoExisted = False

content, images_chat, video_frames = self._message_content_to_chat(prompt)
if len(video_frames) > 0:
videoExisted = True
images_chat = video_frames

msgs = []
query_to_response: List[Dict] = []
images_history = []
for h in chat_history or []:
images_history = []
role = h["role"]
content_h, images_tmp = self._message_content_to_chat(h["content"])
content_h, images_tmp, video_frames_h = self._message_content_to_chat(
h["content"]
)
if images_tmp != []:
images_history = images_tmp
if len(video_frames_h) > 0:
videoExisted = True
images_history = video_frames_h
if len(query_to_response) == 0 and role == "user":
query_to_response.append({"role": "user", "content": content_h})
query_to_response.append(
{"role": "user", "content": images_history + [content_h]}
)
if len(query_to_response) == 1 and role == "assistant":
query_to_response.append({"role": "assistant", "content": content_h})
query_to_response.append(
{"role": "assistant", "content": images_history + [content_h]}
)
if len(query_to_response) == 2:
msgs.extend(query_to_response)
query_to_response = []
image = None
if len(images_chat) > 0:
image = images_chat[0]
elif len(images_history) > 0:
image = images_history[0]
msgs.append({"role": "user", "content": content})
msgs.append({"role": "user", "content": images_chat + [content]})

# Set decode params for video
params = {}
if videoExisted:
params = {"use_image_id": False, "max_slice_nums": 1}

chat = self._model.chat(
image=image,
msgs=json.dumps(msgs, ensure_ascii=True),
image=None,
msgs=msgs,
tokenizer=self._tokenizer,
sampling=True,
**generate_config
**generate_config,
**params,
)
if stream:
it = self.chat_stream(chat)
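Taken together, the new chat() path samples video frames with decord, passes the frames (or images) ahead of the user text in each message, and disables image slicing via the extra decode params. The following condensed, standalone sketch shows the same flow outside of xinference; the checkpoint name, loading flags, prompt, and video path are assumptions for illustration, not part of the diff.

# Standalone sketch of the video path added above. Checkpoint name, loading
# flags, and the video path are illustrative; the frame sampling and the
# chat() call mirror the code in minicpmv26.py.
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import AutoModel, AutoTokenizer

MAX_NUM_FRAMES = 64

def sample_frames(video_path):
    # Take roughly one frame per second, then thin uniformly to MAX_NUM_FRAMES.
    vr = VideoReader(video_path, ctx=cpu(0))
    step = round(vr.get_avg_fps())
    idxs = list(range(0, len(vr), step))
    if len(idxs) > MAX_NUM_FRAMES:
        gap = len(idxs) / MAX_NUM_FRAMES
        idxs = [idxs[int(i * gap + gap / 2)] for i in range(MAX_NUM_FRAMES)]
    frames = vr.get_batch(idxs).asnumpy()
    return [Image.fromarray(f.astype("uint8")) for f in frames]

model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True, torch_dtype=torch.bfloat16
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
)

frames = sample_frames("clip.mp4")  # placeholder path
msgs = [{"role": "user", "content": frames + ["Describe this video."]}]
# use_image_id / max_slice_nums match the decode params set when a video is present.
answer = model.chat(
    image=None, msgs=msgs, tokenizer=tokenizer, sampling=True,
    use_image_id=False, max_slice_nums=1,
)
print(answer)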