update ablation for videomme datasets
choiszt authored and kcz358 committed Jun 20, 2024
1 parent 4587665 commit dbe6329
Showing 3 changed files with 158 additions and 9 deletions.
123 changes: 114 additions & 9 deletions lmms_eval/tasks/videomme/utils.py
100755 → 100644
@@ -10,6 +10,8 @@
import sys
from typing import List, Dict, Optional, Union
import re
import cv2
import numpy as np

eval_logger = logging.getLogger("lmms-eval")

@@ -80,17 +82,55 @@
# cache_dir = os.path.join(hf_home, cache_dir)
# base_cache_dir = config["dataset_kwargs"]["cache_dir"]
base_cache_dir = os.path.expanduser(hf_home)

with open(Path(__file__).parent / "videomme.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
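# Why the filtering above is needed: PyYAML's safe loader has no constructor
# for the custom "!function" tag, so stripping those lines first leaves the
# plain keys loadable. Illustration only (the failing key below is invented):
#   yaml.safe_load('doc_to_text: !function utils.f')  # raises yaml.constructor.ConstructorError
#   yaml.safe_load('cache_dir: videomme')             # -> {'cache_dir': 'videomme'}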


def parse_subtitle_time(time_str):
    # Convert an SRT timestamp "HH:MM:SS,mmm" to seconds as a float.
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
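For example, one minute, 23 seconds and 500 milliseconds parses to 83.5 seconds; an illustrative check (not part of the commit):

assert parse_subtitle_time("00:01:23,500") == 83.5  # 1 * 60 + 23 + 500 / 1000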

def load_subtitles(subtitle_path):
    # Parse an SRT file into {(start_seconds, end_seconds): text}.
    subtitles = {}
    with open(subtitle_path, 'r', encoding='utf-8') as file:
        content = file.read().split('\n\n')
        for section in content:
            if section.strip():
                lines = section.split('\n')
                if len(lines) >= 3:
                    time_range = lines[1].split(' --> ')
                    start_time = parse_subtitle_time(time_range[0])
                    end_time = parse_subtitle_time(time_range[1])
                    text = ' '.join(line for line in lines[2:])
                    subtitles[(start_time, end_time)] = text
    return subtitles
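# load_subtitles assumes standard SRT structure: a numeric cue index, a
# "start --> end" time line, then one or more text lines, with cues separated
# by blank lines. An invented example file:
#
#   1
#   00:00:01,000 --> 00:00:04,000
#   First subtitle line
#
#   2
#   00:00:05,250 --> 00:00:07,500
#   Second subtitle line
#
# parses to {(1.0, 4.0): 'First subtitle line', (5.25, 7.5): 'Second subtitle line'}.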

def convert_time_to_frame(time_in_seconds, fps):
    return int(time_in_seconds * fps)

def extract_subtitles(video_path, subtitle_path):
    # Map each subtitle cue to (start_frame, end_frame, text) using the video's fps.
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    subtitles = load_subtitles(subtitle_path)

    subtitle_frames = []
    for (start_time, end_time), text in subtitles.items():
        start_frame = convert_time_to_frame(start_time, fps)
        end_frame = convert_time_to_frame(end_time, fps)
        subtitle_frames.append((start_frame, end_frame, text))

    return subtitle_frames, total_frame
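A minimal usage sketch for the helpers above; the paths are hypothetical, not from this repo:

# frames is a list of (start_frame, end_frame, text); total is the video's frame count.
frames, total = extract_subtitles("videos/example.mp4", "subtitle/example.srt")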

def videomme_doc_to_visual(doc):
    with open(Path(__file__).parent / "videomme.yaml", "r") as f:
        raw_data = f.readlines()
        safe_data = []
        for i, line in enumerate(raw_data):
            # remove function definition since yaml load cannot handle it
            if "!function" not in line:
                safe_data.append(line)
    cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]

    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = doc["videoID"] + ".mp4"
    video_path = os.path.join(cache_dir, video_path)
@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):


def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option
    full_prompt = option_prompt + "\n" + question + "\n" + "The best answer is:"
    return full_prompt
# Frames + Subs
# This video's subtitles are listed below:
# 【subtitles】

# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
# Frames / Frames + Audio
# Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
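Concretely, videomme_doc_to_text assembles a prompt like the following; the question and options are invented for illustration, and note that str() of the options list keeps the Python list formatting:

doc = {"question": "What color is the car?", "options": ["A. Red", "B. Blue", "C. Green", "D. Black"]}
print(videomme_doc_to_text(doc))
# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# What color is the car?
# ['A. Red', 'B. Blue', 'C. Green', 'D. Black']
# The best answer is: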

def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = doc["videoID"] + ".mp4"
    subtitle_path = os.path.join(cache_dir, "subtitle", doc["videoID"] + ".srt")
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(subtitle_path):  # a subtitle file exists for this video
        subtitle = open(subtitle_path).readlines()
    else:
        subtitle = ""
    subtitles_prompt = "This video's subtitles are listed below: \n"
    if subtitle == "":
        subtitle = "No subtitles available"
    else:
        if "gemini_api_flag" in model_specific_prompt_kwargs:  # specific handling for the Gemini API
            if model_specific_prompt_kwargs["gemini_api_flag"] == "full subtitle":
                textlist = []
                for ele in subtitle:
                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
                    matches = re.findall(pattern, ele)
                    if matches:
                        textlist.append(matches[0])
                subtitle_text = "\n".join(textlist)
        else:
            if "frame_num" in model_specific_prompt_kwargs:
                frame_num = model_specific_prompt_kwargs["frame_num"]
                subtitle_by_frame, total_frame = extract_subtitles(video_path, subtitle_path)
                uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()

                # Keep only the cues whose frame span contains a sampled frame index.
                subtitle_by_frame_idx = []
                for frame_idx in uniform_sampled_frames:
                    for idx, title in enumerate(subtitle_by_frame):
                        if frame_idx < title[1] and frame_idx >= title[0]:
                            subtitle_by_frame_idx.append(idx)
                subtitle_by_frame_idx = list(set(subtitle_by_frame_idx))

                textlist = []
                for idx in subtitle_by_frame_idx:
                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
                    raw_text = re.findall(pattern, subtitle_by_frame[idx][2])
                    try:
                        textlist.append(raw_text[0])
                    except IndexError:
                        continue
                subtitle_text = "\n".join(textlist)
        subtitle = subtitle_text

    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
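To make the frame-based subtitle filtering concrete: with a hypothetical 9,000-frame video and the yaml default of frame_num = 32, np.linspace picks 32 evenly spaced frame indices, and only cues whose [start_frame, end_frame) span contains one of them survive. A small sketch with invented numbers:

import numpy as np

total_frame, frame_num = 9000, 32  # hypothetical video length; frame_num matches the yaml default
uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()
# -> [0, 290, 580, ..., 8999]
# A cue spanning frames (270, 330) is kept, since sampled index 290 falls inside it;
# one spanning (300, 320) is dropped, since no sampled index lands in that range.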
0 changes (file mode only): lmms_eval/tasks/videomme/videomme.yaml
100755 → 100644
44 changes: 44 additions & 0 deletions lmms_eval/tasks/videomme/videomme_w_subtitle.yaml
@@ -0,0 +1,44 @@
dataset_path: lmms-lab/Video-MME
dataset_kwargs:
  token: True
  cache_dir: videomme
  video: True
  # From_YouTube: True
task: videomme_w_subtitle
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videomme_doc_to_visual
doc_to_text: !function utils.videomme_doc_to_text_subtitle
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results will be used by metrics
process_results: !function utils.videomme_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: videomme_percetion_score
    aggregation: !function utils.videomme_aggregate_results
    higher_is_better: true
model_specific_prompt_kwargs:
  default:
    frame_num: 32
  gemini_api:
    gemini_api_flag: "full subtitle"
  # gpt4v:
  #   pre_prompt: ""
  #   post_prompt:
  # # qwen_vl:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # # otterhd:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # xcomposer2_4khd:
  #   pre_prompt: "[UNUSED_TOKEN_146]user\n"
  #   post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
metadata:
  - version: 0.0
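To exercise the new task end to end, an invocation along these lines should work, assuming a standard lmms-eval installation; the model choice and paths are placeholders, not prescribed by this commit:

python3 -m lmms_eval \
    --model llava \
    --model_args pretrained=liuhaotian/llava-v1.5-7b \
    --tasks videomme_w_subtitle \
    --batch_size 1 \
    --output_path ./logs/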
