diff --git a/lmms_eval/tasks/mme_sci/MME-SCI-README_EN.md b/lmms_eval/tasks/mme_sci/MME-SCI-README_EN.md
new file mode 100644
index 000000000..b9420dc0e
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci/MME-SCI-README_EN.md
@@ -0,0 +1,91 @@
+**PAPER**: [MME-SCI](https://www.arxiv.org/abs/2508.13938)
+
+##### How to Evaluate — **Usage of run_eval.sh:**
+
+```
+./run_eval.sh <TASK> <CKPT_PATH> <CONV_TEMPLATE> <MODEL_NAME>
+```
+
+This runs the evaluation and writes the results. Examples are shown below:
+
+###### mme_sci:
+
+```
+cd lmms-eval
+./run_eval.sh mme_sci PATH/TO/Qwen2___5-VL-3B-Instruct qwen_vl qwen2_5_vl_chat
+```
+
+This will output two files:
+
+```
+lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_results.json
+lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_samples_mme_sci.jsonl
+```
+
+`20250928_152715` is a timestamp generated by lmms-eval.
+
+###### mme_sci_image:
+
+```
+cd lmms-eval
+./run_eval.sh mme_sci_image PATH/TO/Qwen2___5-VL-3B-Instruct qwen_vl qwen2_5_vl_chat
+```
+
+---
+
+#### Enabling SGLang for Local Evaluation
+
+**Install dependencies in a new environment to avoid conflicts:**
+
+```
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang[all]>=0.4.6.post4"
+```
+
+Reference: [Install SGLang — SGLang Framework](https://docs.sglang.com.cn/start/install.html)
+
+##### **Explanation of run_judge_sglang.py:**
+
+###### Why do we need `from openai import OpenAI`?
+
+SGLang’s HTTP interface is OpenAI-compatible. The `api_key = os.environ.get("OPENAI_API_KEY", "sk-local")` value is only a placeholder and does not need to be a real token. `client = OpenAI(api_key=api_key, base_url=f"http://{HOST}:{PORT}/v1")` is simply an HTTP client pointed at the local server.
+
+```
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+from lmms_eval.llm_judge.launcher.sglang import SGLangLauncher
+# This uses the SGLangLauncher from "lmms-eval/lmms_eval/llm_judge/launcher/sglang.py"; run the script from the lmms-eval repo root (or adjust sys.path) so the import resolves.
+```
+
+Also, remember to modify `INPUT_FILE` and `OUTPUT_FILE` to your own paths.
+
+```
+python run_judge_sglang.py
+```
+
+This will print the results directly. The terminal logs scroll quickly because SGLang prints a lot of `[INFO]` and prefill messages internally; you can ignore them.
+
+The judge output is written to `lmms-eval/logs/judge_output`.
+
+The final result printed in the logs looks like this:
+
+zh:
+
+```
+Judging samples: 100%|███████████████████████████| 1019/1019 [02:14<00:00, 7.55it/s]
+[INFO] Judging complete.
+[INFO] Total valid samples: 1019
+[INFO] Correct: 260
+[INFO] Accuracy: 25.52%
+```
+
+img:
+
+```
+Judging samples: 100%|███████████████████████████| 1019/1019 [02:40<00:00, 6.35it/s]
+[INFO] Judging complete.
+[INFO] Total valid samples: 1019
+[INFO] Correct: 334
+[INFO] Accuracy: 32.78%
+```
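+
+If you want to recompute the summary from a previous run, the per-sample verdicts in the judge output file are enough. A minimal sketch (the path is the example output above; point it at your own file):
+
+```
+import json
+
+# Judge output written by run_judge_sglang.py; replace with your own path.
+path = "lmms-eval/logs/judge_output/outputs_judged_image.jsonl"
+
+with open(path, "r", encoding="utf-8") as f:
+    records = [json.loads(line) for line in f]
+
+# "error" marks samples the judge could not score; they are excluded, as in the script.
+valid = [r for r in records if r["judge"] in ("correct", "incorrect")]
+correct = sum(r["judge"] == "correct" for r in valid)
+accuracy = correct / len(valid) * 100 if valid else 0.0
+print(f"Accuracy: {accuracy:.2f}% ({correct}/{len(valid)})")
+```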
diff --git a/lmms_eval/tasks/mme_sci/mme_sci.yaml b/lmms_eval/tasks/mme_sci/mme_sci.yaml
new file mode 100644
index 000000000..bf03b8fed
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci/mme_sci.yaml
@@ -0,0 +1,34 @@
+dataset_path: JCruan/MME-SCI
+dataset_kwargs:
+  token: False
+
+task: "mme_sci"
+test_split: "train"
+output_type: "generate_until"
+
+doc_to_visual: !function utils.doc_to_visual
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "answer"
+doc_to_messages: !function utils.doc_to_messages
+
+generation_kwargs:
+  max_new_tokens: 8192
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+
+process_results: !function utils.process_results
+
+metric_list:
+  - metric: mmesci
+    aggregation: !function utils.mmesci_agg
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "请在 '最终答案: ' 之后简洁地写出你给出的答案。"
+
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/mme_sci/run_eval.sh b/lmms_eval/tasks/mme_sci/run_eval.sh
new file mode 100644
index 000000000..846e94d1f
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci/run_eval.sh
@@ -0,0 +1,21 @@
+TASK=$1
+CKPT_PATH=$2
+CONV_TEMPLATE=$3
+MODEL_NAME=$4
+
+echo "Task: $TASK"
+echo "Checkpoint Path: $CKPT_PATH"
+echo "Conversation Template: $CONV_TEMPLATE"
+echo "Model Name: $MODEL_NAME"
+
+TASK_SUFFIX="${TASK//,/_}"
+echo "Task Suffix: $TASK_SUFFIX"
+
+accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \
+    --model qwen2_5_vl \
+    --model_args pretrained=$CKPT_PATH \
+    --tasks $TASK \
+    --batch_size 1 \
+    --log_samples \
+    --log_samples_suffix $TASK_SUFFIX \
+    --output_path ./logs
\ No newline at end of file
diff --git a/lmms_eval/tasks/mme_sci/run_judge_sglang.py b/lmms_eval/tasks/mme_sci/run_judge_sglang.py
new file mode 100644
index 000000000..fcdd6a25c
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci/run_judge_sglang.py
@@ -0,0 +1,104 @@
+import json
+import os
+import sys
+
+from openai import OpenAI
+from tqdm import tqdm
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+from lmms_eval.llm_judge.launcher.sglang import SGLangLauncher
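+# NOTE: this import assumes the script is run from the lmms-eval repo root (or that
+# lmms_eval is installed); ROOT_DIR above only points at this task folder, so adjust
+# sys.path if the import fails. SGLangLauncher starts a local SGLang server for
+# MODEL_PATH and exposes an OpenAI-compatible endpoint at http://HOST:PORT/v1.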
+
+HOST = "127.0.0.1"
+PORT = 8001
+MODEL_PATH = "Qwen2___5-VL-3B-Instruct"
+MEM_FRACTION = 0.6
+TP = 1
+TIMEOUT = 600
+
+INPUT_FILE = "lmms-eval/logs/jiangdan__Qwen2___5-VL-3B-Instruct/20250929_223245_samples_mme_sci_image.jsonl"
+OUTPUT_FILE = "lmms-eval/logs/judge_output/outputs_judged_image.jsonl"
+
+sys_prompt_of_judger = (
+    "You are a strict and impartial judge. "
+    "Based on the original question, the standard answer, and the AI assistant's response provided by the user, "
+    "determine whether the AI assistant's response is correct. "
+    "If there is any difference in meaning between the AI's response and the standard answer, reply with 'incorrect'. "
+    "If the meanings are the same, reply with 'correct'. "
+    "Important: Do not answer the original question, and do not provide reasoning or explanation. Only respond with 'correct' or 'incorrect'."
+)
+
+launcher = SGLangLauncher(
+    host=HOST,
+    port=PORT,
+    model=MODEL_PATH,
+    mem_fraction_static=MEM_FRACTION,
+    tp=TP,
+    timeout=TIMEOUT,
+    enable_torch_compile=False,
+    enable_cuda_graph=False,
+    log_level="warning",
+    log_level_http="warning",
+)
+launcher.launch()
+
+api_key = os.environ.get("OPENAI_API_KEY", "sk-local")
+client = OpenAI(api_key=api_key, base_url=f"http://{HOST}:{PORT}/v1")
+
+judged_samples = []
+
+with open(INPUT_FILE, "r", encoding="utf-8") as f:
+    lines = f.readlines()
+
+for line in tqdm(lines, desc="Judging samples"):
+    sample = json.loads(line)
+    sample_id = sample.get("sample_id")
+    question = sample.get("input")  # or "question"
+    standard_answer = sample.get("target", "").strip()
+    ai_respond = sample.get("filtered_resps", [""])[0].strip()
+
+    judge_prompt = f"""## Original Question: {question}
+
+## Standard Answer: {standard_answer}
+
+## AI Assistant's Response: {ai_respond}
+
+## NOTE: Do not answer the original question, and do not provide reasoning or explanation. Only respond with 'correct' or 'incorrect'.
+
+## Your response:
+"""
+
+    try:
+        messages = []
+        if sys_prompt_of_judger:
+            messages.append({"role": "system", "content": sys_prompt_of_judger})
+        messages.append({"role": "user", "content": judge_prompt})
+        # temperature=0 keeps the judge deterministic; max_tokens=8 is enough for
+        # the single-word "correct" / "incorrect" verdict.
+        resp = client.chat.completions.create(
+            model=MODEL_PATH,
+            messages=messages,
+            temperature=0.0,
+            max_tokens=8,
+            timeout=TIMEOUT,
+        )
+        judge_result = resp.choices[0].message.content.strip()
+        if judge_result not in ["correct", "incorrect"]:
+            judge_result = "error"
+    except Exception as e:
+        print(f"[ERROR] sample_id={sample_id} failed: {e}")
+        judge_result = "error"
+
+    judged_samples.append({"sample_id": sample_id, "judge": judge_result, "target": standard_answer, "filtered_resps": ai_respond})
+
+with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+    for item in judged_samples:
+        f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+valid_samples = [x for x in judged_samples if x["judge"] in ["correct", "incorrect"]]
+total = len(valid_samples)
+correct = sum(1 for x in valid_samples if x["judge"] == "correct")
+accuracy = correct / total * 100 if total > 0 else 0.0
+
+print("[INFO] Judging complete.")
+print(f"[INFO] Total valid samples: {total}")
+print(f"[INFO] Correct: {correct}")
+print(f"[INFO] Accuracy: {accuracy:.2f}%")
diff --git a/lmms_eval/tasks/mme_sci/utils.py b/lmms_eval/tasks/mme_sci/utils.py
new file mode 100644
index 000000000..e19a57ab1
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci/utils.py
@@ -0,0 +1,95 @@
+import base64
+import re
+from io import BytesIO
+from typing import Dict, List, Any
+from PIL import Image
+
+def doc_to_visual(sample: dict) -> list:
+    visual_list = []
+
+    # Main image stored directly on the sample (base64, possibly as a data URL).
+    if "image" in sample:
+        img_val = sample.get("image")
+        if img_val:
+            if img_val.startswith("data:image"):
+                img_val = re.sub(r"^data:image/[^;]+;base64,", "", img_val)
+            img = Image.open(BytesIO(base64.b64decode(img_val)))
+            if img.mode in ("RGBA", "P"):
+                img = img.convert("RGB")
+            visual_list.append(img)
+
+    # Additional figures referenced in the question as <image_N> tags, stored in image_N columns.
+    question = sample.get("question", "")
+    image_tag_nums = re.findall(r"<image_(\d+)>", question)
+    for num in image_tag_nums:
+        img_col = f"image_{num}"
+        img_val = sample.get(img_col)
+        if img_val:
+            if img_val.startswith("data:image"):
+                img_val = re.sub(r"^data:image/[^;]+;base64,", "", img_val)
+            img = Image.open(BytesIO(base64.b64decode(img_val)))
+            if img.mode in ("RGBA", "P"):
+                img = img.convert("RGB")
+            visual_list.append(img)
+
+    return visual_list
+
+
+def pil_to_base64_url(img: Image.Image) -> str:
+    buffered = BytesIO()
+    img.save(buffered, format="PNG")
+    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return f"data:image/png;base64,{img_b64}"
+
+
+def doc_to_text(sample: Dict[str, Any], lmms_kwargs: Dict[str, Any] = None) -> str:
+    pre_prompt = lmms_kwargs.get("pre_prompt", "") if lmms_kwargs else ""
+    post_prompt = lmms_kwargs.get("post_prompt", "") if lmms_kwargs else ""
+    question = str(sample.get("question", "")).strip()
+
+    options = sample.get("options", [])
+    if isinstance(options, dict):
+        options = list(options.values())
+    elif not isinstance(options, list):
+        options = [str(options)]
+
+    options_text = ""
+    if options:
+        letters = ["A", "B", "C", "D"]
+        options_text = "\n".join(f"{letters[i]}: {opt}" for i, opt in enumerate(options) if i < len(letters))
+
+    return f"{pre_prompt}\n{question}\n{options_text}\n{post_prompt}".strip()
+
+
+def doc_to_messages(sample: Dict[str, Any], lmms_kwargs: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+    text_content = doc_to_text(sample, lmms_kwargs)
+    image_list = doc_to_visual(sample)
+
+    content = [{"type": "text", "text": text_content}]
+
+    for img in image_list:
+        # Reuse the helper above instead of re-encoding inline.
+        content.append({"type": "image", "url": pil_to_base64_url(img)})
+
+    return [{"role": "user", "content": content}]
+
+
+def process_results(sample, outputs, *args, **kwargs):
+    # Keep only the text after the final-answer marker requested in the prompt, then
+    # key the result by the metric name so mmesci_agg receives prediction/target pairs.
+    prediction = outputs[0].strip() if outputs else ""
+    if "最终答案:" in prediction:
+        prediction = prediction.split("最终答案:")[-1].strip()
+    target = str(sample.get("answer", "")).strip()
+    return {"mmesci": {"prediction": prediction, "target": target, "sample_id": sample["id"]}}
+
+
+def mmesci_agg(results: List[Dict[str, Any]]) -> Dict[str, float]:
+    total = len(results)
+    if total == 0:
+        return {"accuracy": 0.0}
+
+    correct = sum(1 for r in results if r["prediction"] == r["target"])
+    return {
+        "accuracy": round(correct / total, 4),
+        "total_samples": total,
+        "correct_samples": correct,
+    }
\ No newline at end of file
diff --git a/lmms_eval/tasks/mme_sci_image/MME-SCI-README_EN.md b/lmms_eval/tasks/mme_sci_image/MME-SCI-README_EN.md
new file mode 100644
index 000000000..b9420dc0e
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci_image/MME-SCI-README_EN.md
@@ -0,0 +1,91 @@
+**PAPER**: [MME-SCI](https://www.arxiv.org/abs/2508.13938)
+
+##### How to Evaluate — **Usage of run_eval.sh:**
+
+```
+./run_eval.sh <TASK> <CKPT_PATH> <CONV_TEMPLATE> <MODEL_NAME>
+```
+
+This runs the evaluation and writes the results. Examples are shown below:
+
+###### mme_sci:
+
+```
+cd lmms-eval
+./run_eval.sh mme_sci PATH/TO/Qwen2___5-VL-3B-Instruct qwen_vl qwen2_5_vl_chat
+```
+
+This will output two files:
+
+```
+lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_results.json
+lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_samples_mme_sci.jsonl
+```
+
+`20250928_152715` is a timestamp generated by lmms-eval.
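+
+Each line of the samples file is a JSON record; the judge script described below reads its `input`, `target`, and `filtered_resps` fields. A quick way to inspect the first record before judging (a sketch; substitute your own timestamped path):
+
+```
+import json
+
+path = "lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_samples_mme_sci.jsonl"
+with open(path, "r", encoding="utf-8") as f:
+    first = json.loads(f.readline())
+
+# Confirm the fields the judge script expects are present.
+print(sorted(first.keys()))
+print(first.get("target"), first.get("filtered_resps", [""])[0][:200])
+```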
+
+###### mme_sci_image:
+
+```
+cd lmms-eval
+./run_eval.sh mme_sci_image PATH/TO/Qwen2___5-VL-3B-Instruct qwen_vl qwen2_5_vl_chat
+```
+
+---
+
+#### Enabling SGLang for Local Evaluation
+
+**Install dependencies in a new environment to avoid conflicts:**
+
+```
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang[all]>=0.4.6.post4"
+```
+
+Reference: [Install SGLang — SGLang Framework](https://docs.sglang.com.cn/start/install.html)
+
+##### **Explanation of run_judge_sglang.py:**
+
+###### Why do we need `from openai import OpenAI`?
+
+SGLang’s HTTP interface is OpenAI-compatible. The `api_key = os.environ.get("OPENAI_API_KEY", "sk-local")` value is only a placeholder and does not need to be a real token. `client = OpenAI(api_key=api_key, base_url=f"http://{HOST}:{PORT}/v1")` is simply an HTTP client pointed at the local server.
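+
+Once the launcher has started the server, any OpenAI-style request works against it. A minimal sanity check (a sketch assuming the `HOST`, `PORT`, and `MODEL_PATH` values used in the script; it is not part of the judging pipeline itself):
+
+```
+from openai import OpenAI
+
+# Same endpoint the judge script uses; the key is only a placeholder for the local server.
+client = OpenAI(api_key="sk-local", base_url="http://127.0.0.1:8001/v1")
+
+resp = client.chat.completions.create(
+    model="Qwen2___5-VL-3B-Instruct",
+    messages=[{"role": "user", "content": "Reply with the single word: ready"}],
+    max_tokens=4,
+    temperature=0.0,
+)
+print(resp.choices[0].message.content)
+```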
+
+```
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+from lmms_eval.llm_judge.launcher.sglang import SGLangLauncher
+# This uses the SGLangLauncher from "lmms-eval/lmms_eval/llm_judge/launcher/sglang.py"; run the script from the lmms-eval repo root (or adjust sys.path) so the import resolves.
+```
+
+Also, remember to modify `INPUT_FILE` and `OUTPUT_FILE` to your own paths.
+
+```
+python run_judge_sglang.py
+```
+
+This will print the results directly. The terminal logs scroll quickly because SGLang prints a lot of `[INFO]` and prefill messages internally; you can ignore them.
+
+The judge output is written to `lmms-eval/logs/judge_output`.
+
+The final result printed in the logs looks like this:
+
+zh:
+
+```
+Judging samples: 100%|███████████████████████████| 1019/1019 [02:14<00:00, 7.55it/s]
+[INFO] Judging complete.
+[INFO] Total valid samples: 1019
+[INFO] Correct: 260
+[INFO] Accuracy: 25.52%
+```
+
+img:
+
+```
+Judging samples: 100%|███████████████████████████| 1019/1019 [02:40<00:00, 6.35it/s]
+[INFO] Judging complete.
+[INFO] Total valid samples: 1019
+[INFO] Correct: 334
+[INFO] Accuracy: 32.78%
+```
diff --git a/lmms_eval/tasks/mme_sci_image/mme_sci_image.yaml b/lmms_eval/tasks/mme_sci_image/mme_sci_image.yaml
new file mode 100644
index 000000000..50105b284
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci_image/mme_sci_image.yaml
@@ -0,0 +1,37 @@
+dataset_path: JCruan/MME-SCI
+dataset_kwargs:
+  token: False
+
+
+task: "mme_sci_image"
+test_split: "train"
+output_type: "generate_until"
+
+
+doc_to_visual: !function utils.doc_to_visual
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "answer"
+doc_to_messages: !function utils.doc_to_messages
+
+generation_kwargs:
+  max_new_tokens: 8192
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+
+process_results: !function utils.process_results
+
+metric_list:
+  - metric: mmesci
+    aggregation: !function utils.mmesci_agg
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "请解答图像中给定的问题,并且在 '最终答案: ' 之后简洁地写出你的答案。"
+
+metadata:
+  - version: 0.0
+
diff --git a/lmms_eval/tasks/mme_sci_image/run_eval.sh b/lmms_eval/tasks/mme_sci_image/run_eval.sh
new file mode 100644
index 000000000..846e94d1f
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci_image/run_eval.sh
@@ -0,0 +1,21 @@
+TASK=$1
+CKPT_PATH=$2
+CONV_TEMPLATE=$3
+MODEL_NAME=$4
+
+echo "Task: $TASK"
+echo "Checkpoint Path: $CKPT_PATH"
+echo "Conversation Template: $CONV_TEMPLATE"
+echo "Model Name: $MODEL_NAME"
+
+TASK_SUFFIX="${TASK//,/_}"
+echo "Task Suffix: $TASK_SUFFIX"
+
+accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \
+    --model qwen2_5_vl \
+    --model_args pretrained=$CKPT_PATH \
+    --tasks $TASK \
+    --batch_size 1 \
+    --log_samples \
+    --log_samples_suffix $TASK_SUFFIX \
+    --output_path ./logs
\ No newline at end of file
diff --git a/lmms_eval/tasks/mme_sci_image/run_judge_sglang.py b/lmms_eval/tasks/mme_sci_image/run_judge_sglang.py
new file mode 100644
index 000000000..fcdd6a25c
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci_image/run_judge_sglang.py
@@ -0,0 +1,104 @@
+import json
+import os
+import sys
+
+from openai import OpenAI
+from tqdm import tqdm
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+from lmms_eval.llm_judge.launcher.sglang import SGLangLauncher
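+# NOTE: this import assumes the script is run from the lmms-eval repo root (or that
+# lmms_eval is installed); ROOT_DIR above only points at this task folder, so adjust
+# sys.path if the import fails. SGLangLauncher starts a local SGLang server for
+# MODEL_PATH and exposes an OpenAI-compatible endpoint at http://HOST:PORT/v1.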
+
+HOST = "127.0.0.1"
+PORT = 8001
+MODEL_PATH = "Qwen2___5-VL-3B-Instruct"
+MEM_FRACTION = 0.6
+TP = 1
+TIMEOUT = 600
+
+INPUT_FILE = "lmms-eval/logs/jiangdan__Qwen2___5-VL-3B-Instruct/20250929_223245_samples_mme_sci_image.jsonl"
+OUTPUT_FILE = "lmms-eval/logs/judge_output/outputs_judged_image.jsonl"
+
+sys_prompt_of_judger = (
+    "You are a strict and impartial judge. "
+    "Based on the original question, the standard answer, and the AI assistant's response provided by the user, "
+    "determine whether the AI assistant's response is correct. "
+    "If there is any difference in meaning between the AI's response and the standard answer, reply with 'incorrect'. "
+    "If the meanings are the same, reply with 'correct'. "
+    "Important: Do not answer the original question, and do not provide reasoning or explanation. Only respond with 'correct' or 'incorrect'."
+)
+
+launcher = SGLangLauncher(
+    host=HOST,
+    port=PORT,
+    model=MODEL_PATH,
+    mem_fraction_static=MEM_FRACTION,
+    tp=TP,
+    timeout=TIMEOUT,
+    enable_torch_compile=False,
+    enable_cuda_graph=False,
+    log_level="warning",
+    log_level_http="warning",
+)
+launcher.launch()
+
+api_key = os.environ.get("OPENAI_API_KEY", "sk-local")
+client = OpenAI(api_key=api_key, base_url=f"http://{HOST}:{PORT}/v1")
+
+judged_samples = []
+
+with open(INPUT_FILE, "r", encoding="utf-8") as f:
+    lines = f.readlines()
+
+for line in tqdm(lines, desc="Judging samples"):
+    sample = json.loads(line)
+    sample_id = sample.get("sample_id")
+    question = sample.get("input")  # or "question"
+    standard_answer = sample.get("target", "").strip()
+    ai_respond = sample.get("filtered_resps", [""])[0].strip()
+
+    judge_prompt = f"""## Original Question: {question}
+
+## Standard Answer: {standard_answer}
+
+## AI Assistant's Response: {ai_respond}
+
+## NOTE: Do not answer the original question, and do not provide reasoning or explanation. Only respond with 'correct' or 'incorrect'.
+
+## Your response:
+"""
+
+    try:
+        messages = []
+        if sys_prompt_of_judger:
+            messages.append({"role": "system", "content": sys_prompt_of_judger})
+        messages.append({"role": "user", "content": judge_prompt})
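+        # temperature=0 keeps the judge deterministic; max_tokens=8 is enough for
+        # the single-word "correct" / "incorrect" verdict.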
+        resp = client.chat.completions.create(
+            model=MODEL_PATH,
+            messages=messages,
+            temperature=0.0,
+            max_tokens=8,
+            timeout=TIMEOUT,
+        )
+        judge_result = resp.choices[0].message.content.strip()
+        if judge_result not in ["correct", "incorrect"]:
+            judge_result = "error"
+    except Exception as e:
+        print(f"[ERROR] sample_id={sample_id} failed: {e}")
+        judge_result = "error"
+
+    judged_samples.append({"sample_id": sample_id, "judge": judge_result, "target": standard_answer, "filtered_resps": ai_respond})
+
+with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+    for item in judged_samples:
+        f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+valid_samples = [x for x in judged_samples if x["judge"] in ["correct", "incorrect"]]
+total = len(valid_samples)
+correct = sum(1 for x in valid_samples if x["judge"] == "correct")
+accuracy = correct / total * 100 if total > 0 else 0.0
+
+print("[INFO] Judging complete.")
+print(f"[INFO] Total valid samples: {total}")
+print(f"[INFO] Correct: {correct}")
+print(f"[INFO] Accuracy: {accuracy:.2f}%")
diff --git a/lmms_eval/tasks/mme_sci_image/utils.py b/lmms_eval/tasks/mme_sci_image/utils.py
new file mode 100644
index 000000000..d976da5b7
--- /dev/null
+++ b/lmms_eval/tasks/mme_sci_image/utils.py
@@ -0,0 +1,96 @@
+import base64
+import re
+from io import BytesIO
+from typing import Dict, List, Any
+from PIL import Image
+
+def doc_to_visual(sample: dict) -> list:
+    visual_list = []
+
+    # Main image stored directly on the sample (base64, possibly as a data URL).
+    if "image" in sample:
+        img_val = sample.get("image")
+        if img_val:
+            if img_val.startswith("data:image"):
+                img_val = re.sub(r"^data:image/[^;]+;base64,", "", img_val)
+            img = Image.open(BytesIO(base64.b64decode(img_val)))
+            if img.mode in ("RGBA", "P"):
+                img = img.convert("RGB")
+            visual_list.append(img)
+
+    # Additional figures referenced in the question as <image_N> tags, stored in image_N columns.
+    question = sample.get("question", "")
+    image_tag_nums = re.findall(r"<image_(\d+)>", question)
+    for num in image_tag_nums:
+        img_col = f"image_{num}"
+        img_val = sample.get(img_col)
+        if img_val:
+            if img_val.startswith("data:image"):
+                img_val = re.sub(r"^data:image/[^;]+;base64,", "", img_val)
+            img = Image.open(BytesIO(base64.b64decode(img_val)))
+            if img.mode in ("RGBA", "P"):
+                img = img.convert("RGB")
+            visual_list.append(img)
+
+    return visual_list
+
+
+def pil_to_base64_url(img: Image.Image) -> str:
+    buffered = BytesIO()
+    img.save(buffered, format="PNG")
+    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return f"data:image/png;base64,{img_b64}"
+
+
+def doc_to_text(sample: Dict[str, Any], lmms_kwargs: Dict[str, Any] = None) -> str:
+    pre_prompt = lmms_kwargs.get("pre_prompt", "") if lmms_kwargs else ""
+    post_prompt = lmms_kwargs.get("post_prompt", "") if lmms_kwargs else ""
+    question = str(sample.get("question", "")).strip()
+
+    options = sample.get("options", [])
+    if isinstance(options, dict):
+        options = list(options.values())
+    elif not isinstance(options, list):
+        options = [str(options)]
+
+    options_text = ""
+    if options:
+        letters = ["A", "B", "C", "D"]
+        options_text = "\n".join(f"{letters[i]}: {opt}" for i, opt in enumerate(options) if i < len(letters))
+
+    return f"{pre_prompt}\n{question}\n{options_text}\n{post_prompt}".strip()
+
+
+def doc_to_messages(sample: Dict[str, Any], lmms_kwargs: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+    text_content = doc_to_text(sample, lmms_kwargs)
+    image_list = doc_to_visual(sample)
+
+    content = [{"type": "text", "text": text_content}]
+
+    for img in image_list:
+        # Reuse the helper above instead of re-encoding inline.
+        content.append({"type": "image", "url": pil_to_base64_url(img)})
+
+    return [{"role": "user", "content": content}]
+
+
+def process_results(sample, outputs, *args, **kwargs):
+    # Keep only the text after the final-answer marker requested in the prompt, then
+    # key the result by the metric name so mmesci_agg receives prediction/target pairs.
+    prediction = outputs[0].strip() if outputs else ""
+    if "最终答案:" in prediction:
+        prediction = prediction.split("最终答案:")[-1].strip()
+    target = str(sample.get("answer", "")).strip()
+    return {"mmesci": {"prediction": prediction, "target": target, "sample_id": sample["id"]}}
+
+
+def mmesci_agg(results: List[Dict[str, Any]]) -> Dict[str, float]:
+    total = len(results)
+    if total == 0:
+        return {"accuracy": 0.0}
+
+    correct = sum(1 for r in results if r["prediction"] == r["target"])
+    return {
+        "accuracy": round(correct / total, 4),
+        "total_samples": total,
+        "correct_samples": correct,
+    }
+
\ No newline at end of file