ENH: make MiniCPM v2.6 support video #2068

Merged · 13 commits · Aug 16, 2024
1 change: 1 addition & 0 deletions .github/workflows/python.yaml
@@ -134,6 +134,7 @@ jobs:
pip install -e ".[dev]"
pip install "jinja2==3.1.2"
pip install tensorizer
pip install eva-decord
working-directory: .

- name: Test with pytest
2 changes: 2 additions & 0 deletions setup.cfg
@@ -122,6 +122,7 @@ all =
openai-whisper # For CosyVoice
boto3>=1.28.55,<1.28.65 # For tensorizer
tensorizer~=2.9.0
eva-decord # For video in VL
intel =
torch==2.1.0a0
intel_extension_for_pytorch==2.1.10+xpu
@@ -144,6 +145,7 @@ transformers =
timm>=0.9.16 # For deepseek VL
torchvision # For deepseek VL
peft
eva-decord # For video in VL
vllm =
vllm>=0.2.6
sglang =
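Side note on the new dependency: eva-decord is a drop-in fork of decord and installs under the plain `decord` module name, which is why the loader code in minicpmv26.py below imports `decord` directly. A quick sanity check that the package is usable in an environment (illustrative only, not part of this PR):

import decord  # provided by the eva-decord package
from decord import VideoReader, cpu  # the symbols minicpmv26.py imports below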
53 changes: 44 additions & 9 deletions xinference/core/chat_interface.py
@@ -236,8 +236,8 @@ def predict(history, bot, max_tokens, temperature, stream):
bot[-1][1] = history[-1]["content"]
yield history, bot

def add_text(history, bot, text, image):
logger.debug("Add text, text: %s, image: %s", text, image)
def add_text(history, bot, text, image, video):
logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
if image:
buffered = BytesIO()
with PIL.Image.open(image) as img:
@@ -257,16 +257,47 @@ def add_text(history, bot, text, image):
},
],
}
elif video:

def video_to_base64(video_path):
with open(video_path, "rb") as video_file:
encoded_string = base64.b64encode(video_file.read()).decode(
"utf-8"
)
return encoded_string

def generate_html_video(video_path):
base64_video = video_to_base64(video_path)
video_format = video_path.split(".")[-1]
html_code = f"""
<video controls>
<source src="data:video/{video_format};base64,{base64_video}" type="video/{video_format}">
Your browser does not support the video tag.
</video>
"""
return html_code

display_content = f"{generate_html_video(video)}\n{text}"
message = {
"role": "user",
"content": [
{"type": "text", "text": text},
{
"type": "video_url",
"video_url": {"url": video},
},
],
}
else:
display_content = text
message = {"role": "user", "content": text}
history = history + [message]
bot = bot + [[display_content, None]]
return history, bot, "", None
return history, bot, "", None, None

def clear_history():
logger.debug("Clear history.")
return [], None, "", None
return [], None, "", None, None

def update_button(text):
return gr.update(interactive=bool(text))
@@ -313,6 +344,7 @@ def update_button(text):
)
with gr.Column(scale=3):
imagebox = gr.Image(type="filepath")
videobox = gr.Video()
textbox = gr.Textbox(
show_label=False,
placeholder="Enter text and press ENTER",
@@ -340,8 +372,8 @@ def update_button(text):

textbox.submit(
add_text,
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox, videobox],
[state, chatbot, textbox, imagebox, videobox],
queue=False,
).then(
predict,
@@ -351,8 +383,8 @@ def update_button(text):

submit_btn.click(
add_text,
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox],
[state, chatbot, textbox, imagebox, videobox],
[state, chatbot, textbox, imagebox, videobox],
queue=False,
).then(
predict,
@@ -361,7 +393,10 @@ def update_button(text):
)

clear_btn.click(
clear_history, None, [state, chatbot, textbox, imagebox], queue=False
clear_history,
None,
[state, chatbot, textbox, imagebox, videobox],
queue=False,
)

return chat_vl_interface
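For reference, the video branch above mirrors the existing image handling: the Gradio UI embeds the clip as a base64 <video> tag for display, while the message appended to history uses an OpenAI-style content list with a "video_url" part. A minimal sketch of that message shape as a client might construct it (the text and file path are placeholders, not part of this PR):

# Hypothetical user turn carrying a video; this is the structure add_text()
# builds and _message_content_to_chat() in minicpmv26.py parses below.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What happens in this clip?"},
        {"type": "video_url", "video_url": {"url": "/path/to/clip.mp4"}},
    ],
}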
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements.txt
@@ -61,6 +61,7 @@ openai-whisper # For CosyVoice
boto3>=1.28.55,<1.28.65 # For tensorizer
tensorizer~=2.9.0
imageio-ffmpeg # For video
eva-decord # For video in VL

# sglang
outlines>=0.0.44
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements_cpu.txt
@@ -56,3 +56,4 @@ onnxruntime-gpu==1.16.0; sys_platform == 'linux' # For CosyVoice
onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # For CosyVoice
openai-whisper # For CosyVoice
imageio-ffmpeg # For video
eva-decord # For video in VL
89 changes: 67 additions & 22 deletions xinference/model/llm/pytorch/minicpmv26.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import logging
import time
import uuid
@@ -124,29 +123,60 @@ def _load_image(_url):
else:
return Image.open(BytesIO(response.content)).convert("RGB")

MAX_NUM_FRAMES = 64

def encode_video(video_path):
from decord import VideoReader, cpu

def uniform_sample(l, n):
gap = len(l) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [l[i] for i in idxs]

vr = VideoReader(video_path, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1) # FPS
frame_idx = [i for i in range(0, len(vr), sample_fps)]
if len(frame_idx) > MAX_NUM_FRAMES:
frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype("uint8")) for v in frames]
print("num frames:", len(frames))
return frames

def _load_video(_url):
frames = None
if _url.startswith("data:"):
raise RuntimeError("Only video url format is supported")
else:
frames = encode_video(_url)
return frames

if not isinstance(content, str):
texts = []
image_urls = []
video_urls = []
for c in content:
c_type = c.get("type")
if c_type == "text":
texts.append(c["text"])
elif c_type == "image_url":
image_urls.append(c["image_url"]["url"])
elif c_type == "video_url":
video_urls.append(c["video_url"]["url"])
image_futures = []
with ThreadPoolExecutor() as executor:
for image_url in image_urls:
fut = executor.submit(_load_image, image_url)
image_futures.append(fut)
images = [fut.result() for fut in image_futures]
frames = []
if len(video_urls) > 1:
raise RuntimeError("Only one video per message is supported")
for v in video_urls:
frames = _load_video(v)
text = " ".join(texts)
if len(images) == 0:
return text, []
elif len(images) == 1:
return text, images
else:
raise RuntimeError("Only one image per message is supported")
return content, []
return text, images, frames
return content, [], []

def chat(
self,
@@ -156,36 +186,51 @@ def chat(
generate_config: Optional[PytorchGenerateConfig] = None,
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
stream = generate_config.get("stream", False) if generate_config else False
content, images_chat = self._message_content_to_chat(prompt)
videoExisted = False

content, images_chat, video_frames = self._message_content_to_chat(prompt)
if len(video_frames) > 0:
videoExisted = True
images_chat = video_frames

msgs = []
query_to_response: List[Dict] = []
images_history = []
for h in chat_history or []:
images_history = []
role = h["role"]
content_h, images_tmp = self._message_content_to_chat(h["content"])
content_h, images_tmp, video_frames_h = self._message_content_to_chat(
h["content"]
)
if images_tmp != []:
images_history = images_tmp
if len(video_frames_h) > 0:
videoExisted = True
images_history = video_frames_h
if len(query_to_response) == 0 and role == "user":
query_to_response.append({"role": "user", "content": content_h})
query_to_response.append(
{"role": "user", "content": images_history + [content_h]}
)
if len(query_to_response) == 1 and role == "assistant":
query_to_response.append({"role": "assistant", "content": content_h})
query_to_response.append(
{"role": "assistant", "content": images_history + [content_h]}
)
if len(query_to_response) == 2:
msgs.extend(query_to_response)
query_to_response = []
image = None
if len(images_chat) > 0:
image = images_chat[0]
elif len(images_history) > 0:
image = images_history[0]
msgs.append({"role": "user", "content": content})
msgs.append({"role": "user", "content": images_chat + [content]})

# Set decode params for video
params = {}
if videoExisted:
params = {"use_image_id": False, "max_slice_nums": 1}

chat = self._model.chat(
image=image,
msgs=json.dumps(msgs, ensure_ascii=True),
image=None,
msgs=msgs,
tokenizer=self._tokenizer,
sampling=True,
**generate_config
**generate_config,
**params,
)
if stream:
it = self.chat_stream(chat)
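Taken together, the new chat() path samples video frames with decord, passes the frames (or images) ahead of the user text in each message, and disables image slicing via the extra decode params. The following condensed, standalone sketch shows the same flow outside of xinference; the checkpoint name, loading flags, prompt, and video path are assumptions for illustration, not part of the diff.

# Standalone sketch of the video path added above. Checkpoint name, loading
# flags, and the video path are illustrative; the frame sampling and the
# chat() call mirror the code in minicpmv26.py.
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import AutoModel, AutoTokenizer

MAX_NUM_FRAMES = 64

def sample_frames(video_path):
    # Take roughly one frame per second, then thin uniformly to MAX_NUM_FRAMES.
    vr = VideoReader(video_path, ctx=cpu(0))
    step = round(vr.get_avg_fps())
    idxs = list(range(0, len(vr), step))
    if len(idxs) > MAX_NUM_FRAMES:
        gap = len(idxs) / MAX_NUM_FRAMES
        idxs = [idxs[int(i * gap + gap / 2)] for i in range(MAX_NUM_FRAMES)]
    frames = vr.get_batch(idxs).asnumpy()
    return [Image.fromarray(f.astype("uint8")) for f in frames]

model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True, torch_dtype=torch.bfloat16
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
)

frames = sample_frames("clip.mp4")  # placeholder path
msgs = [{"role": "user", "content": frames + ["Describe this video."]}]
# use_image_id / max_slice_nums match the decode params set when a video is present.
answer = model.chat(
    image=None, msgs=msgs, tokenizer=tokenizer, sampling=True,
    use_image_id=False, max_slice_nums=1,
)
print(answer)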