diff --git a/README.md b/README.md
index 04b62aef..25e8d7dd 100644
--- a/README.md
+++ b/README.md
@@ -275,6 +275,11 @@ We also provide the raw data exported from Weights & Biases for the detailed results
 - TextVQA (textvqa)
   - TextVQA Validation (textvqa_val)
   - TextVQA Test (textvqa_test)
+- VCR-Wiki (vcr_wiki)
+  - VCR-Wiki English easy mode (vcr_wiki_en_easy)
+  - VCR-Wiki English hard mode (vcr_wiki_en_hard)
+  - VCR-Wiki Chinese easy mode (vcr_wiki_zh_easy)
+  - VCR-Wiki Chinese hard mode (vcr_wiki_zh_hard)
 - VizWizVQA (vizwiz_vqa)
   - VizWizVQA Validation (vizwiz_vqa_val)
   - VizWizVQA Test (vizwiz_vqa_test)
diff --git a/lmms_eval/tasks/vcr/_default_template_vcr_yaml b/lmms_eval/tasks/vcr/_default_template_vcr_yaml
deleted file mode 100644
index 8a9095a3..00000000
--- a/lmms_eval/tasks/vcr/_default_template_vcr_yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-dataset_path: vcr-org/VCR-wiki-en-hard
-dataset_kwargs:
-  token: True
-task: "vcr_wiki_en_hard"
-test_split: test
-output_type: generate_until
-doc_to_visual: !function utils.vcr_doc_to_visual
-doc_to_text: !function utils.vcr_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 120
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-# The return value of process_results will be used by metrics
-process_results: !function utils.vcr_en_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-metric_list:
-  - metric: mme_percetion_score
-    aggregation: !function utils.vcr_en_process_results
-    higher_is_better: true
-  - metric: mme_cognition_score
-    aggregation: !function utils.vcr_en_process_results
-    higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
-metadata:
-  - version: 0.0.1
diff --git a/lmms_eval/tasks/vcr/vcr_wiki_en_easy.yaml b/lmms_eval/tasks/vcr/vcr_wiki_en_easy.yaml
deleted file mode 100644
index 1aa2745d..00000000
--- a/lmms_eval/tasks/vcr/vcr_wiki_en_easy.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-dataset_path: vcr-org/VCR-wiki-en-easy
-dataset_kwargs:
-  token: True
-task: "vcr_wiki_en_easy"
-test_split: test
-output_type: generate_until
-doc_to_visual: !function utils.vcr_doc_to_visual
-doc_to_text: !function utils.vcr_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 120
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-# The return value of process_results will be used by metrics
-process_results: !function utils.vcr_en_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-metric_list:
-  - metric: mme_percetion_score
-    aggregation: !function utils.vcr_en_process_results
-    higher_is_better: true
-  - metric: mme_cognition_score
-    aggregation: !function utils.vcr_en_process_results
-    higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
-metadata:
-  - version: 0.0.1
diff --git a/lmms_eval/tasks/vcr/vcr_wiki_en_hard.yaml b/lmms_eval/tasks/vcr/vcr_wiki_en_hard.yaml
deleted file mode 100644
index 8a9095a3..00000000
--- a/lmms_eval/tasks/vcr/vcr_wiki_en_hard.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-dataset_path: vcr-org/VCR-wiki-en-hard
-dataset_kwargs:
-  token: True
-task: "vcr_wiki_en_hard"
-test_split: test
-output_type: generate_until
-doc_to_visual: !function utils.vcr_doc_to_visual
-doc_to_text: !function utils.vcr_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 120
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-# The return value of process_results will be used by metrics
-process_results: !function utils.vcr_en_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-metric_list:
-  - metric: mme_percetion_score
-    aggregation: !function utils.vcr_en_process_results
-    higher_is_better: true
-  - metric: mme_cognition_score
-    aggregation: !function utils.vcr_en_process_results
-    higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
-metadata:
-  - version: 0.0.1
diff --git a/lmms_eval/tasks/vcr/vcr_wiki_zh_easy.yaml b/lmms_eval/tasks/vcr/vcr_wiki_zh_easy.yaml
deleted file mode 100644
index ba888daf..00000000
--- a/lmms_eval/tasks/vcr/vcr_wiki_zh_easy.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-dataset_path: vcr-org/VCR-wiki-zh-easy
-dataset_kwargs:
-  token: True
-task: "vcr_wiki_zh_easy"
-test_split: test
-output_type: generate_until
-doc_to_visual: !function utils.vcr_doc_to_visual
-doc_to_text: !function utils.vcr_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 120
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-# The return value of process_results will be used by metrics
-process_results: !function utils.vcr_zh_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-metric_list:
-  - metric: mme_percetion_score
-    aggregation: !function utils.vcr_zh_process_results
-    higher_is_better: true
-  - metric: mme_cognition_score
-    aggregation: !function utils.vcr_zh_process_results
-    higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。"
-metadata:
-  - version: 0.0.1
diff --git a/lmms_eval/tasks/vcr/vcr_wiki_zh_hard.yaml b/lmms_eval/tasks/vcr/vcr_wiki_zh_hard.yaml
deleted file mode 100644
index b6dc0f7f..00000000
--- a/lmms_eval/tasks/vcr/vcr_wiki_zh_hard.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-dataset_path: vcr-org/VCR-wiki-zh-hard
-dataset_kwargs:
-  token: True
-task: "vcr_wiki_zh_hard"
-test_split: test
-output_type: generate_until
-doc_to_visual: !function utils.vcr_doc_to_visual
-doc_to_text: !function utils.vcr_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 120
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-# The return value of process_results will be used by metrics
-process_results: !function utils.vcr_zh_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-metric_list:
-  - metric: mme_percetion_score
-    aggregation: !function utils.vcr_zh_process_results
-    higher_is_better: true
-  - metric: mme_cognition_score
-    aggregation: !function utils.vcr_zh_process_results
-    higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。"
-metadata:
-  - version: 0.0.1
diff --git a/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml b/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml
new file mode 100644
index 00000000..37ab5e74
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml
@@ -0,0 +1,17 @@
+
+dataset_kwargs:
+  token: True
+output_type: generate_until
+doc_to_visual: !function utils.vcr_doc_to_visual
+doc_to_text: !function utils.vcr_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 120
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metadata:
+  - version: 0.0.1
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr/utils.py b/lmms_eval/tasks/vcr_wiki/utils.py
similarity index 63%
rename from lmms_eval/tasks/vcr/utils.py
rename to lmms_eval/tasks/vcr_wiki/utils.py
index 0079c8fd..8f3aca04 100644
--- a/lmms_eval/tasks/vcr/utils.py
+++ b/lmms_eval/tasks/vcr_wiki/utils.py
@@ -1,9 +1,5 @@
-from collections import defaultdict
 import os
 from difflib import SequenceMatcher as SM
-import datetime
-import json
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 import evaluate
 import logging
 import spacy
@@ -34,6 +30,21 @@
 }
 
 
+def fast_filter(answer_text):
+    if "I can't" in answer_text:
+        return True
+    elif "I cannot" in answer_text:
+        return True
+    elif "sorry" in answer_text.lower():
+        return True
+    if "无法" in answer_text:
+        return True
+    elif "抱歉" in answer_text:
+        return True
+    else:
+        return False
+
+
 def vcr_doc_to_visual(doc):
     return [doc["stacked_image"].convert("RGB"), doc["only_it_image"].convert("RGB")]
 
@@ -63,7 +74,7 @@ def tokenize(text, language):
     return [token.text for token in processed_text]
 
 
-def vcr_process_results_single(doc, result, language):
+def vcr_process_results_single(crossed_text, result, language):
     """
     Args:
         doc: a instance of the eval dataset
@@ -71,8 +82,21 @@ def vcr_process_results_single(crossed_text, result, language):
     Returns:
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
-    crossed_text = doc["crossed_text"]
+    assert language in ["en", "zh"], f"Language {language} is not supported."
+
+    if fast_filter(result):
+        return {
+            "crossed_text": crossed_text,
+            "max_sim_val": 0,
+            "max_sim_string": "",
+            "precision": 0,
+            "recall": 0,
+            "f1": 0,
+            "jaccard": 0,
+            "rouge1": 0,
+            "exact_match": 0,
+        }
 
     tokens_result = tokenize(result, language)
     tokens_crossed_text = tokenize(crossed_text, language)
 
@@ -150,10 +174,26 @@ def vcr_en_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {
-        "res_stacked_image": vcr_process_results_single(doc, results[0], "en"),
-        "res_only_it_image": vcr_process_results_single(doc, results[1], "en"),
-    }
+    output = {}
+    for i in range(len(doc["crossed_text"])):
+        res_stacked_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[0], "en"
+        )
+        res_only_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[1], "en"
+        )
+        output.update(
+            {
+                f"res_stacked_image__{k}___{i}": v
+                for k, v in res_stacked_image_results.items()
+            }
+        )
+        output.update(
+            {
+                f"res_only_it_image__{k}___{i}": v
+                for k, v in res_only_image_results.items()
+            }
+        )
     return output
 
 
@@ -166,10 +206,26 @@ def vcr_zh_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {
-        "res_stacked_image": vcr_process_results_single(doc, results[0], "zh"),
-        "res_only_it_image": vcr_process_results_single(doc, results[1], "zh"),
-    }
+    output = {}
+    for i in range(len(doc["crossed_text"])):
+        res_stacked_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[0], "zh"
+        )
+        res_only_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[1], "zh"
+        )
+        output.update(
+            {
+                f"res_stacked_image__{k}___{i}": v
+                for k, v in res_stacked_image_results.items()
+            }
+        )
+        output.update(
+            {
+                f"res_only_it_image__{k}___{i}": v
+                for k, v in res_only_image_results.items()
+            }
+        )
     return output
 
 
@@ -180,36 +236,30 @@ def vcr_aggregate_results(results):
     Returns:
         A dictionary of dictionary of float, where the outer dictionary has keys "res_stacked_image" and "res_only_it_image"
     """
-    output = {
-        "res_stacked_image": {
-            "max_sim_val": 0,
-            "precision": 0,
-            "recall": 0,
-            "f1": 0,
-            "jaccard": 0,
-            "rouge1": 0,
-        },
-        "res_only_it_image": {
-            "max_sim_val": 0,
-            "precision": 0,
-            "recall": 0,
-            "f1": 0,
-            "jaccard": 0,
-            "rouge1": 0,
-        },
-    }
-    for target_domain in output.keys():
-        for target_metric_name in output[target_domain].keys():
-            score = 0
-            count = 0
-            for inner_dict in results:
-                for inner_key, inner_value in inner_dict.items():
-                    if inner_key == target_domain:
-                        for blank_id, blank_metrics in inner_value.items():
-                            for metric_name, metric_value in blank_metrics.items():
-                                if metric_name == target_metric_name:
-                                    score += metric_value
-                                    count += 1
-            output[target_domain][target_metric_name] = score / count
+    output = {
+        "res_stacked_image__precision": 0,
+        "res_stacked_image__recall": 0,
+        "res_stacked_image__f1": 0,
+        "res_stacked_image__jaccard": 0,
+        "res_stacked_image__rouge1": 0,
+        "res_stacked_image__exact_match": 0,
+        "res_only_it_image__precision": 0,
+        "res_only_it_image__recall": 0,
+        "res_only_it_image__f1": 0,
+        "res_only_it_image__jaccard": 0,
+        "res_only_it_image__rouge1": 0,
+        "res_only_it_image__exact_match": 0,
+    }
+
+    for output_key in output.keys():
+        count = 0
+        query_domain, query_metric_name = output_key.split("__")
+        for inner_dict in results:
+            for inner_key, inner_value in inner_dict.items():
+                key_domain, key_metric_name, _ = inner_key.split("__")
+                if query_domain == key_domain and query_metric_name == key_metric_name:
+                    output[output_key] += inner_value
+                    count += 1
+        output[output_key] /= count
     return output
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_100.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_100.yaml
new file mode 100644
index 00000000..7df8eb38
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_100.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-en-easy-test
+task: "vcr_wiki_en_easy"
+test_split: train[:100]
+process_results: !function utils.vcr_en_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_500.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_500.yaml
new file mode 100644
index 00000000..9079d2cf
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_500.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-en-easy-test
+task: "vcr_wiki_en_easy"
+test_split: train[:500]
+process_results: !function utils.vcr_en_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_5000.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_5000.yaml
new file mode 100644
index 00000000..7a5b09e2
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_5000.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-en-easy-test
+task: "vcr_wiki_en_easy"
+test_split: train
+process_results: !function utils.vcr_en_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_100.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_100.yaml
new file mode 100644
index 00000000..3f24be85
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_100.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-en-hard-test
+task: "vcr_wiki_en_hard"
+test_split: train[:100]
+process_results: !function utils.vcr_en_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_500.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_500.yaml
new file mode 100644
index 00000000..c88c91fe
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_500.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-en-hard-test
+task: "vcr_wiki_en_hard"
+test_split: train[:500]
+process_results: !function utils.vcr_en_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_5000.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_5000.yaml
new file mode 100644
index 00000000..aa5e3634
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_5000.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-en-hard-test
+task: "vcr_wiki_en_hard"
+test_split: train
+process_results: !function utils.vcr_en_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_en_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_100.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_100.yaml
new file mode 100644
index 00000000..9ce02a93
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_100.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-zh-easy-test
+task: "vcr_wiki_zh_easy"
+test_split: train[:100]
+process_results: !function utils.vcr_zh_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。"
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_500.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_500.yaml
new file mode 100644
index 00000000..da210bb8
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_500.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-zh-easy-test
+task: "vcr_wiki_zh_easy"
+test_split: train[:500]
+process_results: !function utils.vcr_zh_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。"
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_5000.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_5000.yaml
new file mode 100644
index 00000000..8807db4a
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_5000.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-zh-easy-test
+task: "vcr_wiki_zh_easy"
+test_split: train
+process_results: !function utils.vcr_zh_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。"
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_100.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_100.yaml
new file mode 100644
index 00000000..c15c8059
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_100.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml"
+dataset_path: vcr-org/VCR-wiki-zh-hard-test
+task: "vcr_wiki_zh_hard"
+test_split: train[:100]
+process_results: !function utils.vcr_zh_process_results
+metric_list:
+  - metric: vcr_percetion_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+  - metric: vcr_cognition_score
+    aggregation: !function utils.vcr_zh_process_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。"
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_500.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_500.yaml
new file mode 100644
index 00000000..90bba34e
--- /dev/null
+++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_500.yaml
@@ -0,0 +1,16 @@
+"include": "_default_template_vcr_yaml" +dataset_path: vcr-org/VCR-wiki-zh-hard-test +task: "vcr_wiki_zh_hard" +test_split: train[:500] +process_results: !function utils.vcr_zh_process_results +metric_list: + - metric: vcr_percetion_score + aggregation: !function utils.vcr_zh_process_results + higher_is_better: true + - metric: vcr_cognition_score + aggregation: !function utils.vcr_zh_process_results + higher_is_better: true +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。" \ No newline at end of file diff --git a/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_5000.yaml b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_5000.yaml new file mode 100644 index 00000000..48893f73 --- /dev/null +++ b/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_5000.yaml @@ -0,0 +1,16 @@ +"include": "_default_template_vcr_yaml" +dataset_path: vcr-org/VCR-wiki-zh-hard-test +task: "vcr_wiki_zh_hard" +test_split: train +process_results: !function utils.vcr_zh_process_results +metric_list: + - metric: vcr_percetion_score + aggregation: !function utils.vcr_zh_process_results + higher_is_better: true + - metric: vcr_cognition_score + aggregation: !function utils.vcr_zh_process_results + higher_is_better: true +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。" \ No newline at end of file