Skip to content

Commit

Permalink
update aggregation function for vcr_wiki
Browse files Browse the repository at this point in the history
  • Loading branch information
sheryc committed Jun 12, 2024
1 parent 326b969 commit 47b13b9
Show file tree
Hide file tree
Showing 13 changed files with 36 additions and 27 deletions.
15 changes: 12 additions & 3 deletions lmms_eval/tasks/vcr_wiki/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from functools import partial

import evaluate
import numpy as np
import spacy
from nltk.util import ngrams
from spacy.cli import download
import numpy as np

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

# Download the English and Chinese models
Expand Down Expand Up @@ -262,7 +263,7 @@ def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
return std, lower_bound, upper_bound


def vcr_aggregate_results(results, args):
def vcr_aggregate_results(results, args, metric='exact_match'):
"""
Args:
results: List[List[Dict]], list of results returned by process_results
Expand All @@ -285,9 +286,17 @@ def vcr_aggregate_results(results, args):
"detailed_results": output_dict_detail_result,
}
now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
path = generate_submission_file(f"vcr_submission_{metric}_{now_date_time}.json", args)
with open(path, "w", encoding="utf-8") as f:
json.dump(output_dict, f, indent=4, ensure_ascii=False)
# print(f"Submission file saved to {path}")
eval_logger.info(f"Submission file saved to {path}")
return mean_score


def vcr_aggregate_exact_match(results, args):
    """Aggregate VCR results under the ``exact_match`` metric label.

    Thin wrapper for use as a task ``aggregation`` hook: forwards ``results``
    and ``args`` unchanged to ``vcr_aggregate_results`` with ``metric`` fixed
    to ``'exact_match'`` and returns that call's result.
    """
    metric_name = 'exact_match'
    return vcr_aggregate_results(results, args, metric=metric_name)


def vcr_aggregate_jaccard(results, args):
    """Aggregate VCR results under the ``jaccard`` metric label.

    Thin wrapper for use as a task ``aggregation`` hook: forwards ``results``
    and ``args`` unchanged to ``vcr_aggregate_results`` with ``metric`` fixed
    to ``'jaccard'`` and returns that call's result.
    """
    metric_name = 'jaccard'
    return vcr_aggregate_results(results, args, metric=metric_name)
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_en_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_en_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_500.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_en_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_en_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_en_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_500.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_en_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_zh_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_zh_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_500.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_zh_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_zh_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_zh_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_500.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ test_split: test
process_results: !function utils.vcr_zh_process_results
metric_list:
- metric: jaccard
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_jaccard
higher_is_better: true
- metric: exact_match
aggregation: !function utils.vcr_aggregate_results
aggregation: !function utils.vcr_aggregate_exact_match
higher_is_better: true
model_specific_prompt_kwargs:
default:
Expand Down

0 comments on commit 47b13b9

Please sign in to comment.