Add LMMs-Lite #148

Merged · 6 commits · Jul 17, 2024
56 changes: 56 additions & 0 deletions lmms_eval/tasks/ai2d/ai2d_lite.yaml
@@ -0,0 +1,56 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
task: "ai2d_lite"
dataset_kwargs:
  token: True
dataset_name: ai2d
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.ai2d_doc_to_visual
doc_to_text: !function utils.ai2d_doc_to_text
doc_to_target: !function utils.ai2d_doc_to_target

model_specific_prompt_kwargs:
  default:
    prompt_format: mcq
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
  gpt4v:
    prompt_format: mcq
    pre_prompt: ""
    post_prompt: "\nAbove choices are given in {option}. {content} format.\nPlease answer with the option letter from the given choices directly."
  qwen_vl:
    prompt_format: qa
    pre_prompt: ""
    post_prompt: " Answer:"
  xcomposer2_4khd:
    prompt_format: mcq_xcomposer
    pre_prompt: "[UNUSED_TOKEN_146]user\nQuestion: "
    post_prompt: "[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is"

model_specific_target_kwargs:
  default: "mcq"
  qwen_vl: "qa"

generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  do_sample: False

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "([A-Z])\\."

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

metadata:
  - version: 0.0
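
For context on the "flexible-extract" filter above: it is configured to pull option letters matching ([A-Z])\\. out of free-form model output and keep match 0. A minimal standalone sketch of that behavior follows; the function name and fallback are illustrative assumptions, not the actual MultiChoiceRegexFilter implementation in utils.

import re

# Illustrative approximation of the "flexible-extract" filter configured above;
# the real MultiChoiceRegexFilter is defined in the task's utils module.
def extract_choice(response: str, group_select: int = 0) -> str:
    # Find every "X." option letter, mirroring regex_pattern: "([A-Z])\\."
    matches = re.findall(r"([A-Z])\.", response)
    if not matches:
        return response.strip()  # assumed fallback: pass the raw response through
    return matches[group_select]  # group_select: 0 keeps the first match

print(extract_choice("The answer is B. The diagram shows..."))  # -> "B"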
35 changes: 35 additions & 0 deletions lmms_eval/tasks/chartqa/chartqa_lite.yaml
@@ -0,0 +1,35 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_kwargs:
  token: True
task: "chartqa_lite"
dataset_name: chartqa
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.chartqa_doc_to_visual
doc_to_text: !function utils.chartqa_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  do_sample: False
process_results: !function utils.chartqa_process_results
metric_list:
  - metric: relaxed_overall
    aggregation: mean
    higher_is_better: true
  - metric: relaxed_human_split
    aggregation: mean
    higher_is_better: true
  - metric: relaxed_augmented_split
    aggregation: mean
    higher_is_better: true
metadata:
  - version: 0.0
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer the question with a single word."
  qwen_vl:
    pre_prompt: ""
    post_prompt: " Answer:"
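
The three relaxed_* metrics above report ChartQA-style relaxed accuracy: a numeric prediction counts as correct within a 5% tolerance of the target, while a non-numeric prediction must match exactly. A rough sketch of the comparison follows; the function name and normalization are assumptions, not the repo's utils.chartqa_process_results.

def relaxed_match(prediction: str, target: str, tolerance: float = 0.05) -> bool:
    # Numeric answers: allow up to 5% relative error against the target.
    try:
        pred, gold = float(prediction), float(target)
        if gold == 0:
            return pred == gold
        return abs(pred - gold) / abs(gold) <= tolerance
    except ValueError:
        # Non-numeric answers: fall back to a case-insensitive exact match.
        return prediction.strip().lower() == target.strip().lower()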

45 changes: 45 additions & 0 deletions lmms_eval/tasks/coco_cap/coco2017_cap_val_lite.yaml
@@ -0,0 +1,45 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_kwargs:
  token: True
task: "coco2017_cap_val_lite"
dataset_name: coco2017_cap_val
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.coco_doc_to_visual
doc_to_text: !function utils.coco_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.coco_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: coco_Bleu_4
    aggregation: !function utils.coco_bleu4
    higher_is_better: true
  - metric: coco_Bleu_3
    aggregation: !function utils.coco_bleu3
    higher_is_better: true
  - metric: coco_Bleu_2
    aggregation: !function utils.coco_bleu2
    higher_is_better: true
  - metric: coco_Bleu_1
    aggregation: !function utils.coco_bleu1
    higher_is_better: true
  - metric: coco_METEOR
    aggregation: !function utils.coco_meteor
    higher_is_better: true
  - metric: coco_ROUGE_L
    aggregation: !function utils.coco_rougel
    higher_is_better: true
  - metric: coco_CIDEr
    aggregation: !function utils.coco_cider
    higher_is_better: true
  #- metric: coco_SPICE
  #  aggregation: !function utils.coco_spice
  #  higher_is_better: true
metadata:
  - version: 0.0
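
Each aggregation hook above (utils.coco_bleu4, utils.coco_cider, ...) collapses the collected per-sample results into one corpus-level score. A minimal sketch of how such a hook could compute CIDEr, assuming the pycocoevalcap package is available; the data layout here is illustrative, not the repo's actual utils code.

from pycocoevalcap.cider.cider import Cider

# Reference captions and model captions, keyed by image id.
refs = {0: ["a cat sits on a mat", "a cat resting on a mat"]}
hyps = {0: ["a cat is sitting on a mat"]}

corpus_score, per_image_scores = Cider().compute_score(refs, hyps)
print(corpus_score)  # single corpus-level CIDEr value, as the aggregation expects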
26 changes: 26 additions & 0 deletions lmms_eval/tasks/docvqa/docvqa_val_lite.yaml
@@ -0,0 +1,26 @@
task: "docvqa_val_lite"
test_split: lite
metric_list:
- metric: anls
aggregation: mean
higher_is_better: true
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: docvqa_val
dataset_kwargs:
token: True
output_type: generate_until
doc_to_visual: !function utils.docvqa_doc_to_visual
doc_to_text: !function utils.docvqa_doc_to_text
doc_to_target: "answers"
generation_kwargs:
max_new_tokens: 32
temperature: 0
do_sample: False
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer the question using a single word or phrase."
qwen_vl:
pre_prompt: ""
post_prompt: " Answer:"
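
The anls metric above is Average Normalized Levenshtein Similarity: each prediction is scored by its best normalized edit similarity against any reference answer, and scores below the conventional 0.5 threshold are zeroed out. A compact sketch follows; the function names are illustrative.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def anls(prediction: str, answers: list[str], threshold: float = 0.5) -> float:
    # Best similarity against any reference answer, thresholded at 0.5.
    best = 0.0
    for ans in answers:
        p, a = prediction.strip().lower(), ans.strip().lower()
        denom = max(len(p), len(a)) or 1
        best = max(best, 1.0 - levenshtein(p, a) / denom)
    return best if best >= threshold else 0.0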

45 changes: 45 additions & 0 deletions lmms_eval/tasks/flickr30k/flickr30k_test_lite.yaml
@@ -0,0 +1,45 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_kwargs:
  token: True
task: "flickr30k_test_lite"
dataset_name: flickr30k_test
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.flickr_doc_to_visual
doc_to_text: !function utils.flickr_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.flickr_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: flickr_Bleu_4
    aggregation: !function utils.flickr_bleu4
    higher_is_better: true
  - metric: flickr_Bleu_3
    aggregation: !function utils.flickr_bleu3
    higher_is_better: true
  - metric: flickr_Bleu_2
    aggregation: !function utils.flickr_bleu2
    higher_is_better: true
  - metric: flickr_Bleu_1
    aggregation: !function utils.flickr_bleu1
    higher_is_better: true
  - metric: flickr_METEOR
    aggregation: !function utils.flickr_meteor
    higher_is_better: true
  - metric: flickr_ROUGE_L
    aggregation: !function utils.flickr_rougel
    higher_is_better: true
  - metric: flickr_CIDEr
    aggregation: !function utils.flickr_cider
    higher_is_better: true
  #- metric: flickr_SPICE
  #  aggregation: !function utils.flickr_spice
  #  higher_is_better: true
metadata:
  - version: 0.0
32 changes: 32 additions & 0 deletions lmms_eval/tasks/gqa/gqa_lite.yaml
@@ -0,0 +1,32 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: gqa
dataset_kwargs:
  token: True
task: "gqa_lite"
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.gqa_doc_to_visual
doc_to_text: !function utils.gqa_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  - version: 0.0

model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer the question using a single word or phrase."
  qwen_vl:
    pre_prompt: ""
    post_prompt: " Answer:"
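
The ignore_case and ignore_punctuation flags on exact_match above mean both sides are normalized before comparison. Roughly, as a sketch rather than the harness's exact normalization:

import string

def normalized_exact_match(prediction: str, target: str) -> bool:
    # Strip punctuation and lowercase both strings, then compare.
    table = str.maketrans("", "", string.punctuation)
    norm = lambda s: s.translate(table).lower().strip()
    return norm(prediction) == norm(target)

print(normalized_exact_match("Yes.", "yes"))  # -> True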
22 changes: 22 additions & 0 deletions lmms_eval/tasks/infovqa/infovqa_val_lite.yaml
@@ -0,0 +1,22 @@
task: "infovqa_val_lite"
test_split: lite
output_type: generate_until
metric_list:
- metric: anls
aggregation: mean
higher_is_better: true
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: infovqa_val
dataset_kwargs:
token: True
doc_to_target: "answers"
doc_to_visual: !function utils.infovqa_doc_to_visual
doc_to_text: !function utils.infovqa_doc_to_text
generation_kwargs:
max_new_tokens: 32
temperature: 0
do_sample: False
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer the question using a single word or phrase."
32 changes: 32 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_cn_dev_lite.yaml
@@ -0,0 +1,32 @@
task: "mmbench_cn_dev_lite"
test_split: "lite"
metric_list:
- metric: gpt_eval_score
aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
higher_is_better: true
aggregation: !function cn_utils.mmbench_aggregate_dev_results
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: mmbench_cn_dev
dataset_kwargs:
token: True
doc_to_target: "answer"
output_type: generate_until
doc_to_visual: !function cn_utils.mmbench_doc_to_visual
doc_to_text: !function cn_utils.mmbench_doc_to_text
generation_kwargs:
max_new_tokens: 256
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false
process_results: !function cn_utils.mmbench_process_results
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
model_specific_generation_kwargs:
llava:
image_aspect_ratio: original

35 changes: 35 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_en_dev_lite.yaml
@@ -0,0 +1,35 @@
task: "mmbench_en_dev_lite"
test_split: lite
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: mmbench_en_dev
dataset_kwargs:
token: True
doc_to_target: "answer"
model_specific_prompt_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly."
doc_to_visual: !function en_utils.mmbench_doc_to_visual
doc_to_text: !function en_utils.mmbench_doc_to_text
doc_to_target: "answer"
process_results: !function en_utils.mmbench_process_results
model_specific_generation_kwargs:
llava:
image_aspect_ratio: original
output_type: generate_until
generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 1024
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false

metric_list:
- metric: gpt_eval_score
aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
higher_is_better: true
46 changes: 46 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps_val_lite.yaml
@@ -0,0 +1,46 @@
dataset_path: lmms-lab/LMMs-Eval-Lite
dataset_name: nocaps_val
dataset_kwargs:
  token: True
task: "nocaps_val_lite"
test_split: lite
output_type: generate_until
doc_to_visual: !function utils.nocaps_doc_to_visual
doc_to_text: !function utils.nocaps_doc_to_text
doc_to_target: "annotations_captions"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils.nocaps_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: nocaps_Bleu_4
    aggregation: !function utils.nocaps_bleu4
    higher_is_better: true
  - metric: nocaps_Bleu_3
    aggregation: !function utils.nocaps_bleu3
    higher_is_better: true
  - metric: nocaps_Bleu_2
    aggregation: !function utils.nocaps_bleu2
    higher_is_better: true
  - metric: nocaps_Bleu_1
    aggregation: !function utils.nocaps_bleu1
    higher_is_better: true
  - metric: nocaps_METEOR
    aggregation: !function utils.nocaps_meteor
    higher_is_better: true
  - metric: nocaps_ROUGE_L
    aggregation: !function utils.nocaps_rougel
    higher_is_better: true
  - metric: nocaps_CIDEr
    aggregation: !function utils.nocaps_cider
    higher_is_better: true
  #- metric: nocaps_SPICE
  #  aggregation: !function utils.nocaps_spice
  #  higher_is_better: true
metadata:
  - version: 0.0
include: _default_template_nocaps_yaml