[TASK] MME-SCI Benchmark #878
@@ -0,0 +1,91 @@
**PAPER**: [MME-SCI](https://www.arxiv.org/abs/2508.13938)

##### How to Evaluate: **Usage of run_eval.sh**

```
./run_eval.sh <TASK> <CKPT_PATH> <CONV_TEMPLATE> <MODEL_NAME>
```

This runs the evaluation and produces the results. Examples are shown below:

###### mme_sci:

```
cd lmms-eval
./run_eval.sh mme_sci PATH/TO/Qwen2___5-VL-3B-Instruct qwen_vl qwen2_5_vl_chat
```

This will output two files:

```
lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_results.json
lmms-eval/logs/Qwen2___5-VL-3B-Instruct/20250928_152715_samples_mme_sci.jsonl
```

`20250928_152715` is a timestamp generated by lmms-eval.
###### **mme_sci_image:**

```
cd lmms-eval
./run_eval.sh mme_sci_image PATH/TO/Qwen2___5-VL-3B-Instruct qwen_vl qwen2_5_vl_chat
```

---

#### Enabling SGLang for Local Evaluation

**Install the dependencies in a fresh environment to avoid conflicts:**

```
pip install --upgrade pip
pip install uv
uv pip install "sglang[all]>=0.4.6.post4"
```

Reference: [Install SGLang — SGLang Framework](https://docs.sglang.com.cn/start/install.html)
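If you prefer to start the judge server manually instead of through `SGLangLauncher`, SGLang also provides a standalone server entry point. A sketch of such a launch (the flag values below simply mirror the constants used later in `run_judge_sglang.py`; they are not settings required by this PR) would be:

```
python -m sglang.launch_server \
  --model-path PATH/TO/Qwen2___5-VL-3B-Instruct \
  --host 127.0.0.1 --port 8001 \
  --mem-fraction-static 0.6 --tp 1
```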
##### **Explanation of run_judge_sglang.py:**

###### Why do we need `from openai import OpenAI`?

SGLang's HTTP server exposes an OpenAI-compatible API. The `api_key = os.environ.get("OPENAI_API_KEY", "sk-local")` value is only a placeholder and does not need to be a real token, and `client = OpenAI(api_key=api_key, base_url=f"http://{HOST}:{PORT}/v1")` is simply an HTTP wrapper around the local server.
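As a minimal illustration of this pattern (host, port, and model path mirror the constants in `run_judge_sglang.py`; the prompt is invented), querying the locally launched server looks like:

```
import os
from openai import OpenAI

# The key is a placeholder; the local SGLang server does not validate it.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "sk-local"),
    base_url="http://127.0.0.1:8001/v1",  # SGLang's OpenAI-compatible endpoint
)

resp = client.chat.completions.create(
    model="Qwen2___5-VL-3B-Instruct",  # must match the model the server was launched with
    messages=[{"role": "user", "content": "Reply with 'correct' or 'incorrect'."}],
    temperature=0.0,
    max_tokens=8,
)
print(resp.choices[0].message.content)
```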
```
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ROOT_DIR)
from lmms_eval.llm_judge.launcher.sglang import SGLangLauncher
# Uses the SGLangLauncher from lmms-eval/lmms_eval/llm_judge/launcher/sglang.py; adjust the path above if your checkout is laid out differently.
```

Also remember to change `INPUT_FILE` and `OUTPUT_FILE` to your own paths.
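For reference, `run_judge_sglang.py` reads the fields `sample_id`, `input` (or `question`), `target`, and `filtered_resps` from each line of `INPUT_FILE`. A hypothetical record (values invented purely for illustration) would look like:

```
{"sample_id": 0, "input": "...", "target": "B", "filtered_resps": ["最终答案: B"]}
```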
```
python run_judge_sglang.py
```

This directly produces the results. The terminal output scrolls quickly because SGLang internally prints many `[INFO]` and prefill logs; they can be ignored.

The judge output can be found at `lmms-eval/logs/judge_output`.

The final result printed in the logs will look like this:

zh:

```
Judging samples: 100%|███████████████████████████| 1019/1019 [02:14<00:00, 7.55it/s]
[INFO] Judging complete.
[INFO] Total valid samples: 1019
[INFO] Correct: 260
[INFO] Accuracy: 25.52%
```

img:

```
Judging samples: 100%|███████████████████████████| 1019/1019 [02:40<00:00, 6.35it/s]
[INFO] Judging complete.
[INFO] Total valid samples: 1019
[INFO] Correct: 334
[INFO] Accuracy: 32.78%
```
@@ -0,0 +1,34 @@
dataset_path: JCruan/MME-SCI
dataset_kwargs:
  token: False

task: "mme_sci"
test_split: "train"
output_type: "generate_until"

doc_to_visual: !function utils.doc_to_visual
doc_to_text: !function utils.doc_to_text
doc_to_target: "answer"
doc_to_messages: !function utils.doc_to_messages

generation_kwargs:
  max_new_tokens: 8192
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false

process_results: !function utils.process_results

metric_list:
  - metric: mmesci
    aggregation: !function utils.mmesci_agg
    higher_is_better: true

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    # English gloss: "Please write your answer concisely after '最终答案: ' (Final Answer:)."
    post_prompt: "请在 '最终答案: ' 之后简洁地写出你给出的答案。"

metadata:
  - version: 0.0
**Review comment:** Can you put the two run scripts into your task folder or into the examples folder? Thanks!
@@ -0,0 +1,21 @@
# Usage: ./run_eval.sh <TASK> <CKPT_PATH> <CONV_TEMPLATE> <MODEL_NAME>
TASK=$1
CKPT_PATH=$2
CONV_TEMPLATE=$3
MODEL_NAME=$4
# Note: CONV_TEMPLATE and MODEL_NAME are currently only echoed; the model is fixed to qwen2_5_vl below.

echo "Task: $TASK"
echo "Checkpoint Path: $CKPT_PATH"
echo "Conversation Template: $CONV_TEMPLATE"
echo "Model Name: $MODEL_NAME"

# Replace commas with underscores so multi-task runs get a valid log suffix.
TASK_SUFFIX="${TASK//,/_}"
echo "Task Suffix: $TASK_SUFFIX"

accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \
    --model qwen2_5_vl \
    --model_args pretrained=$CKPT_PATH \
    --tasks $TASK \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix $TASK_SUFFIX \
    --output_path ./logs
**Review comment:** It looks like run_judge_sglang.py is mostly hardcoded: the model path, memory fraction, input/output files, etc. I wonder if this can be improved further. I also see that you are using the SGLangLauncher from lmms-eval; is it possible to integrate this into utils.py?
@@ -0,0 +1,104 @@
import json
import os
import sys

from openai import OpenAI
from tqdm import tqdm

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ROOT_DIR)
from lmms_eval.llm_judge.launcher.sglang import SGLangLauncher

# Local SGLang server settings for the judge model.
HOST = "127.0.0.1"
PORT = 8001
MODEL_PATH = "Qwen2___5-VL-3B-Instruct"
MEM_FRACTION = 0.6
TP = 1
TIMEOUT = 600

# lmms-eval sample log to judge, and where to write the judged records.
INPUT_FILE = "lmms-eval/logs/jiangdan__Qwen2___5-VL-3B-Instruct/20250929_223245_samples_mme_sci_image.jsonl"
OUTPUT_FILE = "lmms-eval/logs/judge_output/outputs_judged_image.jsonl"

sys_prompt_of_judger = (
    "You are a strict and impartial judge. "
    "Based on the original question, the standard answer, and the AI assistant's response provided by the user, "
    "determine whether the AI assistant's response is correct. "
    "If there is any difference in meaning between the AI's response and the standard answer, reply with 'incorrect'. "
    "If the meanings are the same, reply with 'correct'. "
    "Important: Do not answer the original question, and do not provide reasoning or explanation. Only respond with 'correct' or 'incorrect'."
)

# Launch a local SGLang server that serves the judge model behind an OpenAI-compatible endpoint.
launcher = SGLangLauncher(
    host=HOST,
    port=PORT,
    model=MODEL_PATH,
    mem_fraction_static=MEM_FRACTION,
    tp=TP,
    timeout=TIMEOUT,
    enable_torch_compile=False,
    enable_cuda_graph=False,
    log_level="warning",
    log_level_http="warning",
)
launcher.launch()

# The key is only a placeholder; the local server does not validate it.
api_key = os.environ.get("OPENAI_API_KEY", "sk-local")
client = OpenAI(api_key=api_key, base_url=f"http://{HOST}:{PORT}/v1")

judged_samples = []

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    lines = f.readlines()

for line in tqdm(lines, desc="Judging samples"):
    sample = json.loads(line)
    sample_id = sample.get("sample_id")
    question = sample.get("input")  # or "question"
    standard_answer = sample.get("target", "").strip()
    ai_respond = sample.get("filtered_resps", [""])[0].strip()

    judge_prompt = f"""## Original Question: {question}

## Standard Answer: {standard_answer}

## AI Assistant's Response: {ai_respond}

## NOTE: Do not answer the original question, and do not provide reasoning or explanation. Only respond with 'correct' or 'incorrect'.

## Your response:
"""

    try:
        messages = []
        if sys_prompt_of_judger:
            messages.append({"role": "system", "content": sys_prompt_of_judger})
        messages.append({"role": "user", "content": judge_prompt})
        resp = client.chat.completions.create(
            model=MODEL_PATH,
            messages=messages,
            temperature=0.0,
            max_tokens=8,
            timeout=TIMEOUT,
        )
        judge_result = resp.choices[0].message.content.strip()
        # Anything other than the two allowed verdicts is treated as an error.
        if judge_result not in ["correct", "incorrect"]:
            judge_result = "error"
    except Exception as e:
        print(f"[ERROR] sample_id={sample_id} failed: {e}")
        judge_result = "error"

    judged_samples.append({"sample_id": sample_id, "judge": judge_result, "target": standard_answer, "filtered_resps": ai_respond})

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for item in judged_samples:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# Accuracy over samples that received a valid verdict.
valid_samples = [x for x in judged_samples if x["judge"] in ["correct", "incorrect"]]
total = len(valid_samples)
correct = sum(1 for x in valid_samples if x["judge"] == "correct")
accuracy = correct / total * 100 if total > 0 else 0.0

print(f"[INFO] Judging complete.")
print(f"[INFO] Total valid samples: {total}")
print(f"[INFO] Correct: {correct}")
print(f"[INFO] Accuracy: {accuracy:.2f}%")
@@ -0,0 +1,95 @@
import base64
import re
from io import BytesIO
from typing import Dict, List, Any

from PIL import Image


def doc_to_visual(sample: dict) -> list:
    """Collect all images referenced by a sample as RGB PIL images."""
    visual_list = []

    # Images are stored as base64 strings, optionally with a data-URL prefix.
    if "image" in sample:
        img_val = sample.get("image")
        if img_val:
            if img_val.startswith("data:image"):
                img_val = re.sub(r"^data:image/[^;]+;base64,", "", img_val)
            img = Image.open(BytesIO(base64.b64decode(img_val)))
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
            visual_list.append(img)

    # Questions may reference additional images via <image_N> tags that map to image_N columns.
    question = sample.get("question", "")
    image_tag_nums = re.findall(r"<image_(\d+)>", question)
    for num in image_tag_nums:
        img_col = f"image_{num}"
        img_val = sample.get(img_col)
        if img_val:
            if img_val.startswith("data:image"):
                img_val = re.sub(r"^data:image/[^;]+;base64,", "", img_val)
            img = Image.open(BytesIO(base64.b64decode(img_val)))
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
            visual_list.append(img)

    return visual_list


def pil_to_base64_url(img: Image.Image) -> str:
    """Encode a PIL image as a PNG data URL."""
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_b64}"


def doc_to_text(sample: Dict[str, Any], lmms_kwargs: Dict[str, Any] = None) -> str:
    """Build the text prompt: pre_prompt, question, lettered options, post_prompt."""
    pre_prompt = lmms_kwargs.get("pre_prompt", "") if lmms_kwargs else ""
    post_prompt = lmms_kwargs.get("post_prompt", "") if lmms_kwargs else ""
    question = str(sample.get("question", "")).strip()

    options = sample.get("options", [])
    if isinstance(options, dict):
        options = list(options.values())
    elif not isinstance(options, list):
        options = [str(options)]

    options_text = ""
    if options:
        letters = ["A", "B", "C", "D"]
        options_text = "\n".join(f"{letters[i]}: {opt}" for i, opt in enumerate(options) if i < len(letters))

    return f"{pre_prompt}\n{question}\n{options_text}\n{post_prompt}".strip()


def doc_to_messages(sample: Dict[str, Any], lmms_kwargs: Dict[str, Any] = None) -> List[Dict[str, Any]]:
    """Build an OpenAI-style message list with the prompt text followed by any images."""
    text_content = doc_to_text(sample, lmms_kwargs)
    image_list = doc_to_visual(sample)

    content = [{"type": "text", "text": text_content}]

    for img in image_list:
        content.append({
            "type": "image",
            "url": pil_to_base64_url(img),
        })

    return [{"role": "user", "content": content}]


def process_results(sample, outputs, *args, **kwargs):
    """Collect the prediction and target under the 'mmesci' metric declared in the YAML."""
    target = sample.get("answer", "").strip()
    # The raw model output is needed by mmesci_agg below.
    prediction = outputs[0].strip() if outputs else ""
    return {"mmesci": {"sample_id": sample["id"], "prediction": prediction, "target": target}}


def mmesci_agg(results: List[Dict[str, Any]]) -> Dict[str, float]:
    """Simple exact-match accuracy; the final reported accuracy comes from the LLM-judge step."""
    total = len(results)
    if total == 0:
        return {"accuracy": 0.0}

    correct = sum(1 for r in results if r["prediction"] == r["target"])
    return {
        "accuracy": round(correct / total, 4),
        "total_samples": total,
        "correct_samples": correct,
    }
**Review comment:** This file should be moved into your task folder.
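As a quick sanity check of the helpers above, a minimal usage sketch (the sample dict is invented for illustration; real records come from the JCruan/MME-SCI dataset) might look like:

```
# Hypothetical sample; real records come from the HF dataset.
sample = {
    "id": "demo-0",
    "question": "What is 2 + 2?",
    "options": ["3", "4", "5", "6"],
    "answer": "B",
}
kwargs = {"pre_prompt": "", "post_prompt": "请在 '最终答案: ' 之后简洁地写出你给出的答案。"}

print(doc_to_text(sample, kwargs))
# -> the question, lettered options A-D, then the post prompt

messages = doc_to_messages(sample, kwargs)
print(messages[0]["role"], [part["type"] for part in messages[0]["content"]])
# -> "user" ["text"]  (this sample has no images, so only the text part is present)
```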