Commit

Merge pull request #129 from Dannoopsy/mmbench_ru
add task MMBench-ru
Luodian authored Jul 1, 2024
2 parents e19b43a + ba7081c commit 39d40de
Showing 4 changed files with 163 additions and 0 deletions.
24 changes: 24 additions & 0 deletions lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
@@ -0,0 +1,24 @@
dataset_path: deepvk/MMBench-ru
dataset_kwargs:
  token: True
doc_to_target: "answer"
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nВыбери правильный вариант ответа буквой."  # "Choose the correct answer option by its letter."
doc_to_visual: !function ru_utils.mmbench_doc_to_visual
doc_to_text: !function ru_utils.mmbench_doc_to_text
doc_to_target: "answer"
process_results: !function ru_utils.mmbench_process_results
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
output_type: generate_until
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
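The template binds its dataset hooks to Python callables through `!function` tags (e.g. `!function ru_utils.mmbench_doc_to_text`). A minimal sketch of how such a tag value could be resolved into a callable; `resolve_function_tag` is a hypothetical helper for illustration, not lmms-eval's actual loader:

# Hypothetical sketch (not lmms-eval's actual loader): resolving a "!function"
# tag value such as "ru_utils.mmbench_doc_to_text" into the callable it names.
import importlib

def resolve_function_tag(tag_value, package="lmms_eval.tasks.mmbench"):
    module_name, func_name = tag_value.rsplit(".", 1)
    module = importlib.import_module(f"{package}.{module_name}")  # e.g. lmms_eval.tasks.mmbench.ru_utils
    return getattr(module, func_name)

# doc_to_text = resolve_function_tag("ru_utils.mmbench_doc_to_text")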
1 change: 1 addition & 0 deletions lmms_eval/tasks/mmbench/mmbench.yaml
@@ -5,6 +5,7 @@ task:
   - mmbench_cn_dev
   - mmbench_cn_test
   - mmbench_cn_cc
+  - mmbench_ru_dev
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
10 changes: 10 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml
@@ -0,0 +1,10 @@
task: "mmbench_ru_dev"
test_split: dev
include: _default_template_mmbench_ru_yaml
metric_list:
- metric: gpt_eval_score
aggregation: !function ru_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function ru_utils.mmbench_aggregate_dev_results_submission
higher_is_better: true
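With `mmbench_ru_dev` registered both as a standalone task and inside the `mmbench` group above, it can be selected by name. A minimal launch sketch from Python, assuming the CLI flags documented in the lmms-eval README; the checkpoint name is only an example:

# Sketch: launching the new task via the lmms-eval CLI from Python.
# Flag names follow the lmms-eval README; the checkpoint is only an example.
import subprocess

subprocess.run(
    [
        "python", "-m", "lmms_eval",
        "--model", "llava",
        "--model_args", "pretrained=liuhaotian/llava-v1.5-7b",
        "--tasks", "mmbench_ru_dev",  # "mmbench" would run the whole group
        "--batch_size", "1",
        "--output_path", "./logs/",
    ],
    check=True,
)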
128 changes: 128 additions & 0 deletions lmms_eval/tasks/mmbench/ru_utils.py
@@ -0,0 +1,128 @@
import yaml
import os
from pathlib import Path
import pandas as pd
import json

from loguru import logger as eval_logger
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))

GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
else:
    API_URL = "YOUR_API_URL"
    API_KEY = "YOUR_API_KEY"


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
    option_candidate = ["A", "B", "C", "D", "E"]
    options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)

    data = {
        # "img": doc["image"],
        "question": doc["question"],
        "answer": doc.get("answer", None),
        "options": options_prompt,
        "category": doc["category"],
        "L2-category": doc["l2-category"],
        "options_dict": options_dict,
        "index": doc["index"],
        "hint": doc["hint"],
        "source": doc["source"],
        "split": doc["split"],
    }

    # prepend the hint only when one is present; missing hints arrive as NaN or the literal string "nan"
    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"

    if model_specific_prompt_kwargs:
        query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"

    return query_prompt


def mmbench_process_results(doc, results):
    model_response = results[0].strip()
    # the same record is emitted twice: once for the GPT judge, once for the submission file
    data = {
        "gpt_eval_score": {
            "index": doc["index"],
            "question": doc["question"],
            "answer": doc["answer"],
            "prediction": model_response,
            "hint": doc["hint"],
            "source": doc["source"],
            "split": doc["split"],
            "category": doc["category"],
            "L2-category": doc["l2-category"],
        },
        "submission": {
            "index": doc["index"],
            "question": doc["question"],
            "answer": doc["answer"],
            "prediction": model_response,
            "hint": doc["hint"],
            "source": doc["source"],
            "split": doc["split"],
            "category": doc["category"],
            "L2-category": doc["l2-category"],
        },
    }
    option_candidate = ["A", "B", "C", "D", "E"]
    for c in option_candidate:
        data["submission"][c] = doc.get(c, "nan")
        data["gpt_eval_score"][c] = doc.get(c, "nan")
    return data


def mmbench_aggregate_dev_results_eval(results, args):
    print("============= MMBench-RU(Dev) Detailed Results =============")
    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
    file = generate_submission_file("mmbench_ru_dev_results.json", args)
    details_info = {
        "overall_acc": overall_acc,
        "category_acc": category_acc,
        "l2_category_acc": l2_category_acc,
    }
    with open(file, "w") as f:
        json.dump(details_info, f)
    return overall_acc * 100


def mmbench_aggregate_dev_results_submission(results, args):
    df = pd.DataFrame(results)
    excel_write_path = generate_submission_file("mmbench_ru_dev_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")


def mmbench_aggregate_test_results(results, args):
    df = pd.DataFrame(results)
    excel_write_path = generate_submission_file("mmbench_ru_test_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")
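As a closing illustration, a hedged usage sketch of the prompt builder above. The document below is fabricated, and it assumes `create_options_prompt` tolerates a missing "E" option, as the `doc.get` fallbacks elsewhere in the module suggest:

# Illustration only (not part of the commit): a fabricated doc in the shape
# mmbench_doc_to_text expects; all field values are invented.
fake_doc = {
    "question": "Что изображено на картинке?",  # "What is shown in the picture?"
    "answer": "A",
    "A": "Собака",  # "Dog"
    "B": "Кошка",  # "Cat"
    "C": "Птица",  # "Bird"
    "D": "Рыба",  # "Fish"
    "category": "object_localization",
    "l2-category": "perception",
    "index": 0,
    "hint": "nan",  # the builder treats "nan" as "no hint"
    "source": "MMBench-ru",
    "split": "dev",
}

prompt = mmbench_doc_to_text(fake_doc, {"post_prompt": "\nВыбери правильный вариант ответа буквой."})
print(prompt)  # question, lettered options, then the Russian post-prompt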
