Commit

add conbench
Gumpest committed Jun 5, 2024
1 parent 8ee7848 commit 5fd6845
Showing 2 changed files with 124 additions and 0 deletions.
24 changes: 24 additions & 0 deletions lmms_eval/tasks/conbench/conbench.yaml
@@ -0,0 +1,24 @@
dataset_path: ConBench/ConBench_D
dataset_kwargs:
  token: True
task: "ConBench"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.conbench_doc_to_visual
doc_to_text: !function utils.conbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 1024
  temperature: 0.2
  top_p: 0
  num_beams: 1
  do_sample: True
# The return value of process_results will be used by metrics
process_results: !function utils.conbench_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: ConScore_D
    aggregation: !function utils.conbench_aggregate_results
    higher_is_better: true
metadata:
  - version: 0.0
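# ConScore_D (defined in utils.py): an image counts as a consistency case only
# when all three of its questions (N/Y, Choices, Q/A) are answered correctly.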
100 changes: 100 additions & 0 deletions lmms_eval/tasks/conbench/utils.py
@@ -0,0 +1,100 @@
import logging
import os
from collections import defaultdict

from anls import anls_score

eval_logger = logging.getLogger("lmms-eval")

dir_name = os.path.dirname(os.path.abspath(__file__))

# 19 question categories across the three groups below
eval_type_dict = {
    "Sensation": ["count", "color", "scene", "poster", "attribute_recognition", "ocr", "position"],
    "Cognition": ["calculation", "code", "translation", "math", "cross_instance_reason", "attribute_reason"],
    "Knowledge": ["celebrity", "chemistry", "physics", "biology", "landmark", "artwork"],
}


def conbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def conbench_doc_to_text(doc):
    question = doc["question"].strip()
    return question


def parse_pred_ans_NY(pred_ans):
    """Normalize a free-form answer to "yes", "no", or "other"."""
    pred_label = None
    if pred_ans in ["yes", "no"]:
        pred_label = pred_ans
    else:
        # Fall back to the first few characters, which catches e.g. "yes," or "no."
        prefix_pred_ans = pred_ans[:4]
        if "yes" in prefix_pred_ans:
            pred_label = "yes"
        elif "no" in prefix_pred_ans:
            pred_label = "no"
        else:
            pred_label = "other"
    return pred_label


def parse_pred_ans_choice(pred_ans):
    return pred_ans.replace(" ", "")[0]
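# Illustrative examples (predictions are lowercased upstream in conbench_process_results):
#   parse_pred_ans_NY("yes, they match") -> "yes"
#   parse_pred_ans_NY("unclear")         -> "other"
#   parse_pred_ans_choice("b. the cat")  -> "b"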


def conbench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case ConScore_D), value: metric value
    """
    pred = results[0].replace("\n", "").lower()
    # parse the prediction according to the question type
    if doc["question_field"] == "N/Y":
        pred_ans = parse_pred_ans_NY(pred)
    elif doc["question_field"] == "Choices":
        pred_ans = parse_pred_ans_choice(pred)
    else:
        pred_ans = pred

    gt_ans = doc["answer"].lower()

    # score: exact match, or (for open-ended Q/A questions) a sufficient ANLS similarity
    is_qa = doc["question_field"] == "Q/A"
    anls_ok = is_qa and anls_score(prediction=pred_ans, gold_labels=[gt_ans], threshold=0.95) >= 0.4
    score = 1 if anls_ok or gt_ans == pred_ans else 0
    # Note: the key name here is very important. It decides which aggregation function will receive the results
    # We note down the image id / question field to help us aggregate the results later
    return {"ConScore_D": {"image_id": doc["image_id"], "question_field": doc["question_field"], "score": score}}


def conbench_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        A score
    """
    # Sum the per-question scores for each image
    summary = defaultdict(int)
    for result in results:
        summary[result["image_id"]] += result["score"]

    # An image is a consistency case only when all three of its questions are correct
    cnt_con = sum(1 for score in summary.values() if score == 3)
    eval_logger.info(f"Consistency cases: {cnt_con}")

    # Each image contributes three questions, so len(results) / 3 is the number of images
    con_score = cnt_con / (len(results) / 3)
    eval_logger.info(f"ConScore_D: {con_score:.2f}")
    return con_score
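For reference, a minimal standalone sketch of how ConScore_D behaves on toy data (the image ids and scores below are invented for illustration; the dicts mirror what conbench_process_results emits under the "ConScore_D" key):

from collections import defaultdict

# Three questions per image; an image is a consistency case only if all three score 1.
toy_results = [
    {"image_id": "img_0", "question_field": "N/Y", "score": 1},
    {"image_id": "img_0", "question_field": "Choices", "score": 1},
    {"image_id": "img_0", "question_field": "Q/A", "score": 1},  # img_0: consistent
    {"image_id": "img_1", "question_field": "N/Y", "score": 1},
    {"image_id": "img_1", "question_field": "Choices", "score": 0},
    {"image_id": "img_1", "question_field": "Q/A", "score": 1},  # img_1: not consistent
]

per_image = defaultdict(int)
for r in toy_results:
    per_image[r["image_id"]] += r["score"]

consistent = sum(1 for s in per_image.values() if s == 3)
print(consistent / (len(toy_results) / 3))  # -> 0.5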
