update ablation for videomme datasets
choiszt authored and kcz358 committed Jun 20, 2024
1 parent 4587665 commit dbe6329
Showing 3 changed files with 158 additions and 9 deletions.
123 changes: 114 additions & 9 deletions lmms_eval/tasks/videomme/utils.py
100755 → 100644
@@ -10,6 +10,8 @@
import sys
from typing import List, Dict, Optional, Union
import re
import cv2
import numpy as np

eval_logger = logging.getLogger("lmms-eval")

@@ -80,17 +82,55 @@
# cache_dir = os.path.join(hf_home, cache_dir)
# base_cache_dir = config["dataset_kwargs"]["cache_dir"]
base_cache_dir = os.path.expanduser(hf_home)

with open(Path(__file__).parent / "videomme.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
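# Why the filtering above is needed: PyYAML's safe loader has no constructor
# for the custom "!function" tag, so stripping those lines first leaves the
# plain keys loadable. Illustration only (the failing key below is invented):
#   yaml.safe_load('doc_to_text: !function utils.f')  # raises yaml.constructor.ConstructorError
#   yaml.safe_load('cache_dir: videomme')             # -> {'cache_dir': 'videomme'}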


def parse_subtitle_time(time_str):
    # Convert an SRT timestamp "HH:MM:SS,mmm" to seconds as a float.
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
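For example, one minute, 23 seconds and 500 milliseconds parses to 83.5 seconds; an illustrative check (not part of the commit):

assert parse_subtitle_time("00:01:23,500") == 83.5  # 1 * 60 + 23 + 500 / 1000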

def load_subtitles(subtitle_path):
    # Parse an SRT file into {(start_seconds, end_seconds): text}.
    subtitles = {}
    with open(subtitle_path, 'r', encoding='utf-8') as file:
        content = file.read().split('\n\n')
        for section in content:
            if section.strip():
                lines = section.split('\n')
                if len(lines) >= 3:
                    time_range = lines[1].split(' --> ')
                    start_time = parse_subtitle_time(time_range[0])
                    end_time = parse_subtitle_time(time_range[1])
                    text = ' '.join(line for line in lines[2:])
                    subtitles[(start_time, end_time)] = text
    return subtitles
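# load_subtitles assumes standard SRT structure: a numeric cue index, a
# "start --> end" time line, then one or more text lines, with cues separated
# by blank lines. An invented example file:
#
#   1
#   00:00:01,000 --> 00:00:04,000
#   First subtitle line
#
#   2
#   00:00:05,250 --> 00:00:07,500
#   Second subtitle line
#
# parses to {(1.0, 4.0): 'First subtitle line', (5.25, 7.5): 'Second subtitle line'}.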

def convert_time_to_frame(time_in_seconds, fps):
    return int(time_in_seconds * fps)

def extract_subtitles(video_path, subtitle_path):
    # Map each subtitle cue to (start_frame, end_frame, text) using the video's fps.
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    subtitles = load_subtitles(subtitle_path)

    subtitle_frames = []
    for (start_time, end_time), text in subtitles.items():
        start_frame = convert_time_to_frame(start_time, fps)
        end_frame = convert_time_to_frame(end_time, fps)
        subtitle_frames.append((start_frame, end_frame, text))

    return subtitle_frames, total_frame
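A minimal usage sketch for the helpers above; the paths are hypothetical, not from this repo:

# frames is a list of (start_frame, end_frame, text); total is the video's frame count.
frames, total = extract_subtitles("videos/example.mp4", "subtitle/example.srt")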

def videomme_doc_to_visual(doc):
    with open(Path(__file__).parent / "videomme.yaml", "r") as f:
        raw_data = f.readlines()
        safe_data = []
        for i, line in enumerate(raw_data):
            # remove function definition since yaml load cannot handle it
            if "!function" not in line:
                safe_data.append(line)
    cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]

    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = doc["videoID"] + ".mp4"
    video_path = os.path.join(cache_dir, video_path)
@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):


def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option
    full_prompt = option_prompt + "\n" + question + "\n" + "The best answer is:"
    return full_prompt
# Frames + Subs
# This video's subtitles are listed below:
# 【subtitles】

# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
# Frames / Frames + Audio
# Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
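Concretely, videomme_doc_to_text assembles a prompt like the following; the question and options are invented for illustration, and note that str() of the options list keeps the Python list formatting:

doc = {"question": "What color is the car?", "options": ["A. Red", "B. Blue", "C. Green", "D. Black"]}
print(videomme_doc_to_text(doc))
# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# What color is the car?
# ['A. Red', 'B. Blue', 'C. Green', 'D. Black']
# The best answer is: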

def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = doc["videoID"] + ".mp4"
    subtitle_path = os.path.join(cache_dir, "subtitle", doc["videoID"] + ".srt")
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(subtitle_path):  # a subtitle file exists for this video
        subtitle = open(subtitle_path).readlines()
    else:
        subtitle = ""
    subtitles_prompt = "This video's subtitles are listed below: \n"
    if subtitle == "":
        subtitle = "No subtitles available"
    else:
        if "gemini_api_flag" in model_specific_prompt_kwargs:  # specific handling for the Gemini API
            if model_specific_prompt_kwargs["gemini_api_flag"] == "full subtitle":
                textlist = []
                for ele in subtitle:
                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
                    matches = re.findall(pattern, ele)
                    if matches:
                        textlist.append(matches[0])
                subtitle_text = "\n".join(textlist)
        else:
            if "frame_num" in model_specific_prompt_kwargs:
                frame_num = model_specific_prompt_kwargs["frame_num"]
                subtitle_by_frame, total_frame = extract_subtitles(video_path, subtitle_path)
                uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()

                # Keep only the cues whose frame span contains a sampled frame index.
                subtitle_by_frame_idx = []
                for frame_idx in uniform_sampled_frames:
                    for idx, title in enumerate(subtitle_by_frame):
                        if frame_idx < title[1] and frame_idx >= title[0]:
                            subtitle_by_frame_idx.append(idx)
                subtitle_by_frame_idx = list(set(subtitle_by_frame_idx))

                textlist = []
                for idx in subtitle_by_frame_idx:
                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
                    raw_text = re.findall(pattern, subtitle_by_frame[idx][2])
                    try:
                        textlist.append(raw_text[0])
                    except IndexError:
                        continue
                subtitle_text = "\n".join(textlist)
        subtitle = subtitle_text

    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
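To make the frame-based subtitle filtering concrete: with a hypothetical 9,000-frame video and the yaml default of frame_num = 32, np.linspace picks 32 evenly spaced frame indices, and only cues whose [start_frame, end_frame) span contains one of them survive. A small sketch with invented numbers:

import numpy as np

total_frame, frame_num = 9000, 32  # hypothetical video length; frame_num matches the yaml default
uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()
# -> [0, 290, 580, ..., 8999]
# A cue spanning frames (270, 330) is kept, since sampled index 290 falls inside it;
# one spanning (300, 320) is dropped, since no sampled index lands in that range.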
0 changes (file mode only): lmms_eval/tasks/videomme/videomme.yaml
100755 → 100644
44 changes: 44 additions & 0 deletions lmms_eval/tasks/videomme/videomme_w_subtitle.yaml
@@ -0,0 +1,44 @@
dataset_path: lmms-lab/Video-MME
dataset_kwargs:
  token: True
  cache_dir: videomme
  video: True
  # From_YouTube: True
task: videomme_w_subtitle
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videomme_doc_to_visual
doc_to_text: !function utils.videomme_doc_to_text_subtitle
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results will be used by metrics
process_results: !function utils.videomme_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: videomme_percetion_score
    aggregation: !function utils.videomme_aggregate_results
    higher_is_better: true
model_specific_prompt_kwargs:
  default:
    frame_num: 32
  gemini_api:
    gemini_api_flag: "full subtitle"
  # gpt4v:
  #   pre_prompt: ""
  #   post_prompt:
  # # qwen_vl:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # # otterhd:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # xcomposer2_4khd:
  #   pre_prompt: "[UNUSED_TOKEN_146]user\n"
  #   post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
metadata:
  - version: 0.0
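To exercise the new task end to end, an invocation along these lines should work, assuming a standard lmms-eval installation; the model choice and paths are placeholders, not prescribed by this commit:

python3 -m lmms_eval \
    --model llava \
    --model_args pretrained=liuhaotian/llava-v1.5-7b \
    --tasks videomme_w_subtitle \
    --batch_size 1 \
    --output_path ./logs/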
