Add LMMs-Lite #148

Merged · 6 commits · Jul 17, 2024
56 changes: 56 additions & 0 deletions lmms_eval/tasks/ai2d/ai2d_lite.yaml
@@ -0,0 +1,56 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
task: "ai2d_lite"
dataset_kwargs:
  token: True
dataset_name: ai2d
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.ai2d_doc_to_visual
doc_to_text: !function utils.ai2d_doc_to_text
doc_to_target: !function utils.ai2d_doc_to_target

model_specific_prompt_kwargs:
  default:
    prompt_format: mcq
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
  gpt4v:
    prompt_format: mcq
    pre_prompt: ""
    post_prompt: "\nAbove choices are given in {option}. {content} format.\nPlease answer with the option letter from the given choices directly."
  qwen_vl:
    prompt_format: qa
    pre_prompt: ""
    post_prompt: " Answer:"
  xcomposer2_4khd:
    prompt_format: mcq_xcomposer
    pre_prompt: "[UNUSED_TOKEN_146]user\nQuestion: "
    post_prompt: "[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is"

model_specific_target_kwargs:
  default: "mcq"
  qwen_vl: "qa"

generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  do_sample: False

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "([A-Z])\\."

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

metadata:
  - version: 0.0
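
For context on the "flexible-extract" filter above: it is configured to pull option letters matching ([A-Z])\\. out of free-form model output and keep match 0. A minimal standalone sketch of that behavior follows; the function name and fallback are illustrative assumptions, not the actual MultiChoiceRegexFilter implementation in utils.

import re

# Illustrative approximation of the "flexible-extract" filter configured above;
# the real MultiChoiceRegexFilter is defined in the task's utils module.
def extract_choice(response: str, group_select: int = 0) -> str:
    # Find every "X." option letter, mirroring regex_pattern: "([A-Z])\\."
    matches = re.findall(r"([A-Z])\.", response)
    if not matches:
        return response.strip()  # assumed fallback: pass the raw response through
    return matches[group_select]  # group_select: 0 keeps the first match

print(extract_choice("The answer is B. The diagram shows..."))  # -> "B"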
35 changes: 35 additions & 0 deletions lmms_eval/tasks/chartqa/chartqa_lite.yaml
@@ -0,0 +1,35 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_kwargs:
  token: True
task: "chartqa_lite"
dataset_name: chartqa
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.chartqa_doc_to_visual
doc_to_text: !function utils.chartqa_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  do_sample: False
process_results: !function utils.chartqa_process_results
metric_list:
  - metric: relaxed_overall
    aggregation: mean
    higher_is_better: true
  - metric: relaxed_human_split
    aggregation: mean
    higher_is_better: true
  - metric: relaxed_augmented_split
    aggregation: mean
    higher_is_better: true
metadata:
  - version: 0.0
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer the question with a single word."
  qwen_vl:
    pre_prompt: ""
    post_prompt: " Answer:"
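
The three relaxed_* metrics above report ChartQA-style relaxed accuracy: a numeric prediction counts as correct within a 5% tolerance of the target, while a non-numeric prediction must match exactly. A rough sketch of the comparison follows; the function name and normalization are assumptions, not the repo's utils.chartqa_process_results.

def relaxed_match(prediction: str, target: str, tolerance: float = 0.05) -> bool:
    # Numeric answers: allow up to 5% relative error against the target.
    try:
        pred, gold = float(prediction), float(target)
        if gold == 0:
            return pred == gold
        return abs(pred - gold) / abs(gold) <= tolerance
    except ValueError:
        # Non-numeric answers: fall back to a case-insensitive exact match.
        return prediction.strip().lower() == target.strip().lower()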

45 changes: 45 additions & 0 deletions lmms_eval/tasks/coco_cap/coco2017_cap_val_lite.yaml
@@ -0,0 +1,45 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_kwargs:
  token: True
task: "coco2017_cap_val_lite"
dataset_name: coco2017_cap_val
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.coco_doc_to_visual
doc_to_text: !function utils.coco_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.coco_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: coco_Bleu_4
    aggregation: !function utils.coco_bleu4
    higher_is_better: true
  - metric: coco_Bleu_3
    aggregation: !function utils.coco_bleu3
    higher_is_better: true
  - metric: coco_Bleu_2
    aggregation: !function utils.coco_bleu2
    higher_is_better: true
  - metric: coco_Bleu_1
    aggregation: !function utils.coco_bleu1
    higher_is_better: true
  - metric: coco_METEOR
    aggregation: !function utils.coco_meteor
    higher_is_better: true
  - metric: coco_ROUGE_L
    aggregation: !function utils.coco_rougel
    higher_is_better: true
  - metric: coco_CIDEr
    aggregation: !function utils.coco_cider
    higher_is_better: true
  #- metric: coco_SPICE
  #  aggregation: !function utils.coco_spice
  #  higher_is_better: true
metadata:
  - version: 0.0
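
Each aggregation hook above (utils.coco_bleu4, utils.coco_cider, ...) collapses the collected per-sample results into one corpus-level score. A minimal sketch of how such a hook could compute CIDEr, assuming the pycocoevalcap package is available; the data layout here is illustrative, not the repo's actual utils code.

from pycocoevalcap.cider.cider import Cider

# Reference captions and model captions, keyed by image id.
refs = {0: ["a cat sits on a mat", "a cat resting on a mat"]}
hyps = {0: ["a cat is sitting on a mat"]}

corpus_score, per_image_scores = Cider().compute_score(refs, hyps)
print(corpus_score)  # single corpus-level CIDEr value, as the aggregation expects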
26 changes: 26 additions & 0 deletions lmms_eval/tasks/docvqa/docvqa_val_lite.yaml
@@ -0,0 +1,26 @@
task: "docvqa_val_lite"
test_split: lite
metric_list:
- metric: anls
aggregation: mean
higher_is_better: true
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: docvqa_val
dataset_kwargs:
token: True
output_type: generate_until
doc_to_visual: !function utils.docvqa_doc_to_visual
doc_to_text: !function utils.docvqa_doc_to_text
doc_to_target: "answers"
generation_kwargs:
max_new_tokens: 32
temperature: 0
do_sample: False
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer the question using a single word or phrase."
qwen_vl:
pre_prompt: ""
post_prompt: " Answer:"
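
The anls metric above is Average Normalized Levenshtein Similarity: each prediction is scored by its best normalized edit similarity against any reference answer, and scores below the conventional 0.5 threshold are zeroed out. A compact sketch follows; the function names are illustrative.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def anls(prediction: str, answers: list[str], threshold: float = 0.5) -> float:
    # Best similarity against any reference answer, thresholded at 0.5.
    best = 0.0
    for ans in answers:
        p, a = prediction.strip().lower(), ans.strip().lower()
        denom = max(len(p), len(a)) or 1
        best = max(best, 1.0 - levenshtein(p, a) / denom)
    return best if best >= threshold else 0.0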

45 changes: 45 additions & 0 deletions lmms_eval/tasks/flickr30k/flickr30k_test_lite.yaml
@@ -0,0 +1,45 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_kwargs:
  token: True
task: "flickr30k_test_lite"
dataset_name: flickr30k_test
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.flickr_doc_to_visual
doc_to_text: !function utils.flickr_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.flickr_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: flickr_Bleu_4
    aggregation: !function utils.flickr_bleu4
    higher_is_better: true
  - metric: flickr_Bleu_3
    aggregation: !function utils.flickr_bleu3
    higher_is_better: true
  - metric: flickr_Bleu_2
    aggregation: !function utils.flickr_bleu2
    higher_is_better: true
  - metric: flickr_Bleu_1
    aggregation: !function utils.flickr_bleu1
    higher_is_better: true
  - metric: flickr_METEOR
    aggregation: !function utils.flickr_meteor
    higher_is_better: true
  - metric: flickr_ROUGE_L
    aggregation: !function utils.flickr_rougel
    higher_is_better: true
  - metric: flickr_CIDEr
    aggregation: !function utils.flickr_cider
    higher_is_better: true
  #- metric: flickr_SPICE
  #  aggregation: !function utils.flickr_spice
  #  higher_is_better: true
metadata:
  - version: 0.0
32 changes: 32 additions & 0 deletions lmms_eval/tasks/gqa/gqa_lite.yaml
@@ -0,0 +1,32 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: gqa
dataset_kwargs:
  token: True
task: "gqa_lite"
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.gqa_doc_to_visual
doc_to_text: !function utils.gqa_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  - version: 0.0

model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer the question using a single word or phrase."
  qwen_vl:
    pre_prompt: ""
    post_prompt: " Answer:"
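
The ignore_case and ignore_punctuation flags on exact_match above mean both sides are normalized before comparison. Roughly, as a sketch rather than the harness's exact normalization:

import string

def normalized_exact_match(prediction: str, target: str) -> bool:
    # Strip punctuation and lowercase both strings, then compare.
    table = str.maketrans("", "", string.punctuation)
    norm = lambda s: s.translate(table).lower().strip()
    return norm(prediction) == norm(target)

print(normalized_exact_match("Yes.", "yes"))  # -> True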
22 changes: 22 additions & 0 deletions lmms_eval/tasks/infovqa/infovqa_val_lite.yaml
@@ -0,0 +1,22 @@
task: "infovqa_val_lite"
test_split: lite
output_type: generate_until
metric_list:
- metric: anls
aggregation: mean
higher_is_better: true
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: infovqa_val
dataset_kwargs:
token: True
doc_to_target: "answers"
doc_to_visual: !function utils.infovqa_doc_to_visual
doc_to_text: !function utils.infovqa_doc_to_text
generation_kwargs:
max_new_tokens: 32
temperature: 0
do_sample: False
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer the question using a single word or phrase."
32 changes: 32 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_cn_dev_lite.yaml
@@ -0,0 +1,32 @@
task: "mmbench_cn_dev_lite"
test_split: "lite"
metric_list:
- metric: gpt_eval_score
aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
higher_is_better: true
aggregation: !function cn_utils.mmbench_aggregate_dev_results
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: mmbench_cn_dev
dataset_kwargs:
token: True
doc_to_target: "answer"
output_type: generate_until
doc_to_visual: !function cn_utils.mmbench_doc_to_visual
doc_to_text: !function cn_utils.mmbench_doc_to_text
generation_kwargs:
max_new_tokens: 256
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false
process_results: !function cn_utils.mmbench_process_results
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
model_specific_generation_kwargs:
llava:
image_aspect_ratio: original

35 changes: 35 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_en_dev_lite.yaml
@@ -0,0 +1,35 @@
task: "mmbench_en_dev_lite"
test_split: lite
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: mmbench_en_dev
dataset_kwargs:
token: True
doc_to_target: "answer"
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly."
doc_to_visual: !function en_utils.mmbench_doc_to_visual
doc_to_text: !function en_utils.mmbench_doc_to_text
doc_to_target: "answer"
process_results: !function en_utils.mmbench_process_results
model_specific_generation_kwargs:
llava:
image_aspect_ratio: original
output_type: generate_until
generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 1024
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false

metric_list:
- metric: gpt_eval_score
aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
higher_is_better: true
46 changes: 46 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps_val_lite.yaml
@@ -0,0 +1,46 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: nocaps_val
dataset_kwargs:
  token: True
task: "nocaps_val_lite"
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.nocaps_doc_to_visual
doc_to_text: !function utils.nocaps_doc_to_text
doc_to_target: "annotations_captions"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.nocaps_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: nocaps_Bleu_4
    aggregation: !function utils.nocaps_bleu4
    higher_is_better: true
  - metric: nocaps_Bleu_3
    aggregation: !function utils.nocaps_bleu3
    higher_is_better: true
  - metric: nocaps_Bleu_2
    aggregation: !function utils.nocaps_bleu2
    higher_is_better: true
  - metric: nocaps_Bleu_1
    aggregation: !function utils.nocaps_bleu1
    higher_is_better: true
  - metric: nocaps_METEOR
    aggregation: !function utils.nocaps_meteor
    higher_is_better: true
  - metric: nocaps_ROUGE_L
    aggregation: !function utils.nocaps_rougel
    higher_is_better: true
  - metric: nocaps_CIDEr
    aggregation: !function utils.nocaps_cider
    higher_is_better: true
  #- metric: nocaps_SPICE
  #  aggregation: !function utils.nocaps_spice
  #  higher_is_better: true
metadata:
  - version: 0.0
include: _default_template_nocaps_yaml