Skip to content

Commit

Permalink
add vinoground
Browse files Browse the repository at this point in the history
  • Loading branch information
jzhang2427 committed Oct 16, 2024
1 parent 8c2d89b commit 2d466f7
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 0 deletions.
109 changes: 109 additions & 0 deletions lmms_eval/tasks/vinoground/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import os
from pathlib import Path

import numpy as np
import yaml
from loguru import logger as eval_logger

# Root of the Hugging Face cache: HF_HOME when set, otherwise the default
# location under the user's home directory ("~" is expanded below).
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)

# Read the sibling task YAML and keep only the lines that do not contain
# "!function" before handing the text to yaml.safe_load.
with (Path(__file__).parent / "vinoground.yaml").open("r") as f:
    raw_data = f.readlines()
safe_data = [line for line in raw_data if "!function" not in line]

# Name of the dataset cache sub-directory, taken from dataset_kwargs.cache_dir.
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]


def vinoground_doc_to_visual(doc):
    """Return the video file for ``doc`` as a one-element list.

    ``doc["index"]`` is split on underscores. When its third field is
    ``"text"`` the file is ``vinoground_videos/<field0>_<field1>.mp4``;
    for any other value it is ``vinoground_videos_concated/<field0>.mp4``.
    Both locations are resolved under ``base_cache_dir/cache_name``.

    Args:
        doc: Mapping with an ``"index"`` string of at least three
            underscore-separated fields.

    Returns:
        A list containing the single resolved video path.

    Raises:
        Exception: If the resolved path does not exist on disk.
    """
    fields = doc["index"].split("_")
    if fields[2] == "text":
        subdir, stem = "vinoground_videos", "_".join(fields[:2])
    else:
        subdir, stem = "vinoground_videos_concated", fields[0]

    video_path = os.path.join(base_cache_dir, cache_name, subdir, stem + ".mp4")
    if os.path.exists(video_path):
        return [video_path]
    raise Exception(f"video path:{video_path} does not exist, please check")


def vinoground_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the two-option multiple-choice prompt for ``doc``.

    The third underscore-separated field of ``doc["index"]`` selects the
    question form:

    * ``"text"``: the two captions ``doc["pos_cap"]`` and ``doc["neg_cap"]``
      are offered as options A and B.
    * anything else: one caption is shown (``doc["<field1>_cap"]``, where
      ``<field1>`` is the second index field) and the options are the first
      or second fragment of the video.

    Args:
        doc: Mapping with ``"index"`` and the caption keys described above.
        lmms_eval_specific_kwargs: Accepted for interface compatibility;
            not read by this function.

    Returns:
        The prompt as a single newline-joined string.
    """
    index_fields = doc["index"].split("_")
    answer_instruction = "Answer with the option's letter from the given choices directly. Please only output 1 English character."

    if index_fields[2] == "text":
        prompt_lines = [
            "Which caption best describes this video?",
            "A. " + doc["pos_cap"],
            "B. " + doc["neg_cap"],
        ]
    else:
        caption_key = f"{index_fields[1]}_cap"
        prompt_lines = [
            "Which video segment matches this caption? Note: The video contains two segments separated by a 2-second black frame.",
            f"Caption: {doc[caption_key]}",
            "A. The first fragment (before black frame)\nB. The second fragment (after black frame)",
        ]

    prompt_lines.append(answer_instruction)
    return "\n".join(prompt_lines)


def vinoground_process_results(doc, results):
    """Package one prediction with the metadata needed for aggregation.

    Args:
        doc: Mapping with ``"index"``, ``"major"`` and ``"minor"``.
            ``"minor"`` is either ``None`` or a ``";"``-separated string.
        results: Sequence of model outputs; only the first one is used.

    Returns:
        ``{"vinoground_score": {...}}`` where the inner dict holds the
        document index, its categories (the major category followed by any
        minor ones), the question type (third underscore-separated field of
        the index) and the prediction.
    """
    prediction = results[0]

    major_category = doc["major"]
    minor_categories = doc["minor"]
    categories = [major_category] + ([] if minor_categories is None else minor_categories.split(";"))

    index = doc["index"]
    return {
        "vinoground_score": {
            "index": index,
            "categories": categories,
            "question_type": index.split("_")[2],
            "pred": prediction,
        }
    }


def vinoground_aggregate_results(results):
    """Aggregate per-question predictions into text, video and group scores.

    Every entry of ``results`` is a dict with the keys ``"index"``,
    ``"categories"``, ``"question_type"`` and ``"pred"``. The leading
    underscore-separated field of ``"index"`` is the integer id that groups
    the questions belonging together. For each id the following flags are
    tracked:

    ====== ==========================================================
    column meaning
    ====== ==========================================================
    0 / 1  "pos" / non-"pos" question answered correctly (text type)
    2      both text questions correct
    3 / 4  "pos" / non-"pos" question answered correctly (video type)
    5      both video questions correct
    6      columns 2 and 5 both set
    ====== ==========================================================

    A prediction is correct when its first character, case-insensitively,
    is ``A`` for an index containing ``"pos"`` and ``B`` otherwise. An empty
    prediction counts as incorrect.

    A per-category breakdown is written to the evaluation logger. Scores are
    averaged over the ids that actually occur in ``results``, so partial runs
    are supported. ``results`` itself is not modified.

    Args:
        results: Iterable of per-question dicts as described above.

    Returns:
        A tuple ``(text, video, group)`` of percentages in ``[0, 100]``.
        All three are ``0.0`` when ``results`` is empty.
    """
    if not results:
        eval_logger.warning("No Vinoground results to aggregate; reporting 0 for all scores.")
        return 0.0, 0.0, 0.0

    rows = {}  # id -> list of the 7 flags described in the docstring
    index_to_categories = {}

    for result in results:
        index = result["index"]
        is_pos = "pos" in index
        column = 0 if is_pos else 1
        if result["question_type"] == "video":
            column += 3
        expected = "a" if is_pos else "b"
        idx = int(index.split("_")[0])

        # Slicing instead of indexing keeps an empty prediction from raising.
        is_correct = result["pred"][:1].lower() == expected
        rows.setdefault(idx, [0] * 7)[column] = int(is_correct)

        if idx not in index_to_categories:
            # Copy the list: appending in place would grow the caller's data
            # on every aggregation pass and double-count the "all" bucket.
            index_to_categories[idx] = [*result["categories"], "all"]

    seen = sorted(rows)
    matrix = np.array([rows[idx] for idx in seen], dtype=np.int8)
    matrix[:, 2] = matrix[:, 0] & matrix[:, 1]
    matrix[:, 5] = matrix[:, 3] & matrix[:, 4]
    matrix[:, 6] = matrix[:, 2] & matrix[:, 5]

    # category -> [number of ids, text hits, video hits, group hits]
    tallies = {}
    for idx, row in zip(seen, matrix):
        # Cast to Python int: accumulating NumPy int8 scalars can wrap once a
        # category collects more than 127 hits.
        text_hit, video_hit, group_hit = int(row[2]), int(row[5]), int(row[6])
        for category in index_to_categories[idx]:
            tally = tallies.setdefault(category, [0, 0, 0, 0])
            tally[0] += 1
            tally[1] += text_hit
            tally[2] += video_hit
            tally[3] += group_hit

    report = ["Categorical results:\n"]
    for category, (total, text_hits, video_hits, group_hits) in tallies.items():
        report.append(
            f"{category}: text: {text_hits / total * 100:.2f}%, video: {video_hits / total * 100:.2f}%, group: {group_hits / total * 100:.2f}%\n"
        )
    eval_logger.info("".join(report))

    return matrix[:, 2].mean() * 100, matrix[:, 5].mean() * 100, matrix[:, 6].mean() * 100
31 changes: 31 additions & 0 deletions lmms_eval/tasks/vinoground/vinoground.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# lmms-eval task configuration for Vinoground.
# utils.py also reads this file at import time: it drops every line that
# contains "!function" and parses the remainder with yaml.safe_load.
dataset_path: HanSolo9682/Vinoground
dataset_kwargs:
token: True
# utils.py joins this value onto the Hugging Face cache root to locate the videos.
cache_dir: vinoground
video: True

task: vinoground
test_split: lmmseval
output_type: generate_until
# Hooks implemented in utils.py.
doc_to_visual: !function utils.vinoground_doc_to_visual
doc_to_text: !function utils.vinoground_doc_to_text
doc_to_target: "answer"
# Sampling disabled, single beam, at most 16 new tokens per answer.
generation_kwargs:
max_new_tokens: 16
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false

process_results: !function utils.vinoground_process_results

# vinoground_score entries are reduced by utils.vinoground_aggregate_results.
metric_list:
- metric: vinoground_score
aggregation: !function utils.vinoground_aggregate_results
higher_is_better: true
# NOTE(review): utils.vinoground_doc_to_text accepts lmms_eval_specific_kwargs
# but builds its prompt from hard-coded strings, so these values appear to be
# unused — confirm before relying on them.
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly. Please only output one English character."
metadata:
- version: 0.0

0 comments on commit 2d466f7

Please sign in to comment.