diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index c38e6289..2700427f 100644
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -93,6 +93,7 @@ class TaskConfig(dict):
     model_specific_prompt_kwargs: dict = None
     model_specific_generation_kwargs: dict = None
+    model_specific_target_kwargs: dict = None
 
     def __post_init__(self) -> None:
         if self.dataset_path and os.path.exists(os.path.dirname(self.dataset_path)):
@@ -347,7 +348,7 @@ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None:
         doc_id_iterator = utils.create_iterator([i for i in range(len(docs))], rank, world_size, limit)
         doc_id_iterator, doc_id_iterator_counting = itertools.tee(doc_id_iterator)
         total_docs = sum(1 for _ in doc_id_iterator_counting)
-        pbar = tqdm(total=total_docs, desc=f"Building context {rank}", position=rank)
+        pbar = tqdm(total=total_docs, desc=f"Building context", disable=(rank != 0))
         for doc_id in doc_id_iterator:
             # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(doc_id, 0 if self.config.num_fewshot is None else self.config.num_fewshot, self.config.training_split if self.has_training_docs() else split)
@@ -594,14 +595,20 @@ def _prepare_model_specific_config(self):
             if self.model_name in self.model_specific_prompt_kwargs:
                 self.model_specific_prompt_kwargs = self.model_specific_prompt_kwargs[self.model_name]
             else:
-                self.model_specific_prompt_kwargs = self.model_specific_prompt_kwargs["default"]
+                self.model_specific_prompt_kwargs = self.model_specific_prompt_kwargs.get("default", None)
+        self.model_specific_target_kwargs = self.config.model_specific_target_kwargs
+        if self.model_specific_target_kwargs is not None:
+            if self.model_name in self.model_specific_target_kwargs:
+                self.model_specific_target_kwargs = self.model_specific_target_kwargs[self.model_name]
+            else:
+                self.model_specific_target_kwargs = self.model_specific_target_kwargs.get("default", None)
 
         self.model_specific_generation_kwargs = self.config.model_specific_generation_kwargs
         if self.model_specific_generation_kwargs is not None:
             if self.model_name in self.model_specific_generation_kwargs:
                 self.model_specific_generation_kwargs = self.model_specific_generation_kwargs[self.model_name]
             else:
-                self.model_specific_generation_kwargs = self.model_specific_generation_kwargs["default"]
+                self.model_specific_generation_kwargs = self.model_specific_generation_kwargs.get("default", {})
 
             self.config.generation_kwargs.update(self.model_specific_generation_kwargs)
 
@@ -839,7 +846,7 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]:
         elif type(doc_to_target) == list:
             return doc_to_target
         elif callable(doc_to_target):
-            return doc_to_target(doc)
+            return doc_to_target(doc, self.model_specific_target_kwargs) if self.model_specific_target_kwargs is not None else doc_to_target(doc)
         # Used when applying a Promptsource template
         elif hasattr(doc_to_target, "apply"):
             applied_prompt = doc_to_target.apply(doc)
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index b4cb94d0..4af44131 100644
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -319,7 +319,7 @@ def evaluate(
         # Don't use above one, this would crash if doc_iterator_for_counting contains too many objects and very slow
         doc_iterator_for_counting = itertools.islice(range(len(task.test_docs())), lm.rank, limit, lm.world_size) if task.has_test_docs() else itertools.islice(range(len(task.validation_docs())), lm.rank, limit, lm.world_size)
         total_docs = sum(1 for _ in doc_iterator_for_counting)
-        pbar = tqdm(total=total_docs, desc=f"Postprocessing {lm.rank}", position=lm.rank)
+        pbar = tqdm(total=total_docs, desc=f"Postprocessing", disable=(lm.rank != 0))
         for doc_id, doc in doc_iterator:
             # subset instances to only this document id ; sort by idx
             requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
@@ -427,12 +427,14 @@ def evaluate(
             else:
                 group_name = None
             agg_fn = task.aggregation()[metric]
-            # Bo: for models only need agg items
-            if inspect.getfullargspec(agg_fn).args == ["results"]:
-                results[task_name][metric_key] = agg_fn(items)
 
+            # Bo: for models that need to know the args to save to correct path
-            elif inspect.getfullargspec(agg_fn).args == ["results", "args"]:
+            if inspect.getfullargspec(agg_fn).args == ["results", "args"]:
                 results[task_name][metric_key] = agg_fn(items, cli_args)
+            else:
+                # Bo: for models only need agg items
+                results[task_name][metric_key] = agg_fn(items)
+
             results[task_name]["samples"] = len(items)
 
             # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
diff --git a/lmms_eval/models/qwen_vl.py b/lmms_eval/models/qwen_vl.py
index a74939b3..03edb9c5 100644
--- a/lmms_eval/models/qwen_vl.py
+++ b/lmms_eval/models/qwen_vl.py
@@ -42,7 +42,7 @@ def __init__(
         self._tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=trust_remote_code)
         self.tokenizer.padding_side = "left"
         self.tokenizer.pad_token_id = self.tokenizer.eod_id
-        self.prompt = "{}{} Answer:"
+        self.prompt = "{}{}"
         self._config = self._model.config
         self.model.tie_weights()
         self.batch_size_per_gpu = int(batch_size)
diff --git a/lmms_eval/tasks/ai2d/ai2d.yaml b/lmms_eval/tasks/ai2d/ai2d.yaml
index d8871ae0..e032a3c4 100644
--- a/lmms_eval/tasks/ai2d/ai2d.yaml
+++ b/lmms_eval/tasks/ai2d/ai2d.yaml
@@ -17,12 +17,20 @@ metric_list:
     higher_is_better: true
     ignore_case: true
     ignore_punctuation: true
-process_results: !function utils.ai2d_process_results
 metadata:
   - version: 0.0
 
 model_specific_prompt_kwargs:
   default:
+    prompt_format: mcq
     pre_prompt: ""
     post_prompt: "\nAnswer with the option's letter from the given choices directly."
-
+  # qwen formulate ai2d as question answering instead of mcq
+  qwen_vl:
+    prompt_format: qa
+    pre_prompt: ""
+    post_prompt: " Answer:"
+
+model_specific_target_kwargs:
+  default: "mcq"
+  qwen_vl: "qa"
\ No newline at end of file
diff --git a/lmms_eval/tasks/ai2d/utils.py b/lmms_eval/tasks/ai2d/utils.py
index 9c745fb8..0549fbab 100644
--- a/lmms_eval/tasks/ai2d/utils.py
+++ b/lmms_eval/tasks/ai2d/utils.py
@@ -1,32 +1,27 @@
 def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None):
     question, choices = doc["question"], doc["options"]
     len_choices = len(choices)
-    options = [chr(ord("A") + i) for i in range(len_choices)]
-    choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)])
-
     post_prompt = model_specific_prompt_kwargs["post_prompt"]
     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    return f"{pre_prompt}{question}\n{choices_str}{post_prompt}"
+    if model_specific_prompt_kwargs["prompt_format"] == "mcq":
+        options = [chr(ord("A") + i) for i in range(len_choices)]
+        choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)])
+        return f"{pre_prompt}{question}\n{choices_str}{post_prompt}"
+    elif model_specific_prompt_kwargs["prompt_format"] == "qa":
+        options = "\n".join(choices)
+        return f"{pre_prompt}{question}{options}{post_prompt}"
+    else:
+        raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}")
 
 
 def ai2d_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def ai2d_doc_to_target(doc):
-    len_choices = len(doc["options"])
-    options = [chr(ord("A") + i) for i in range(len_choices)]
-    return options[int(doc["answer"])]
-
-
-def ai2d_process_results(doc, results):
-    # I know this is weird, but it's how llava parse it.
-    target = ai2d_doc_to_target(doc)
-    pred = results[0]
-    if pred == target:
-        return {"exact_match": 1.0}
-    # pattern: ^[A-Z]\. .*
-    if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".":
-        result = 1.0 if pred[0] == target else 0.0
-        return {"exact_match": result}
-    return {"exact_match": 0.0}
+def ai2d_doc_to_target(doc, model_specific_target_kwargs):
+    if model_specific_target_kwargs == "mcq":
+        len_choices = len(doc["options"])
+        options = [chr(ord("A") + i) for i in range(len_choices)]
+        return options[int(doc["answer"])]
+    elif model_specific_target_kwargs == "qa":
+        return doc["options"][int(doc["answer"])]
diff --git a/lmms_eval/tasks/chartqa/chartqa.yaml b/lmms_eval/tasks/chartqa/chartqa.yaml
index d7bea6d7..e8b7e82f 100644
--- a/lmms_eval/tasks/chartqa/chartqa.yaml
+++ b/lmms_eval/tasks/chartqa/chartqa.yaml
@@ -30,5 +30,5 @@ model_specific_prompt_kwargs:
     post_prompt: "\nAnswer the question with a single word."
   qwen_vl:
     pre_prompt: ""
-    post_prompt: ""
+    post_prompt: " Answer:"
 
diff --git a/lmms_eval/tasks/chartqa/utils.py b/lmms_eval/tasks/chartqa/utils.py
index 786a8c87..99de0989 100644
--- a/lmms_eval/tasks/chartqa/utils.py
+++ b/lmms_eval/tasks/chartqa/utils.py
@@ -2,10 +2,10 @@ def chartqa_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def chartqa_doc_to_text(doc, mdoel_specific_prompt_kwargs):
+def chartqa_doc_to_text(doc, model_specific_prompt_kwargs):
     question = doc["question"]
-    pre_prompt = mdoel_specific_prompt_kwargs["pre_prompt"]
-    post_prompt = mdoel_specific_prompt_kwargs["post_prompt"]
+    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
+    post_prompt = model_specific_prompt_kwargs["post_prompt"]
     return f"{pre_prompt}{question}{post_prompt}"
 
 
diff --git a/lmms_eval/tasks/docvqa/docvqa_val.yaml b/lmms_eval/tasks/docvqa/docvqa_val.yaml
index 8e7c6d7f..0080c5f0 100644
--- a/lmms_eval/tasks/docvqa/docvqa_val.yaml
+++ b/lmms_eval/tasks/docvqa/docvqa_val.yaml
@@ -20,4 +20,6 @@ model_specific_prompt_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
-
\ No newline at end of file
+  qwen_vl:
+    pre_prompt: ""
+    post_prompt: " Answer:"
\ No newline at end of file
diff --git a/lmms_eval/tasks/docvqa/utils.py b/lmms_eval/tasks/docvqa/utils.py
index d4051b1e..a92e492b 100644
--- a/lmms_eval/tasks/docvqa/utils.py
+++ b/lmms_eval/tasks/docvqa/utils.py
@@ -6,10 +6,10 @@ def docvqa_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def docvqa_doc_to_text(doc, mdoel_specific_prompt_kwargs):
+def docvqa_doc_to_text(doc, model_specific_prompt_kwargs):
     question = doc["question"]
-    pre_prompt = mdoel_specific_prompt_kwargs["pre_prompt"]
-    post_prompt = mdoel_specific_prompt_kwargs["post_prompt"]
+    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
+    post_prompt = model_specific_prompt_kwargs["post_prompt"]
     return f"{pre_prompt}{question}{post_prompt}"
 
 
diff --git a/lmms_eval/tasks/gqa/gqa.yaml b/lmms_eval/tasks/gqa/gqa.yaml
index a1fa2fe2..404e5a05 100644
--- a/lmms_eval/tasks/gqa/gqa.yaml
+++ b/lmms_eval/tasks/gqa/gqa.yaml
@@ -22,3 +22,11 @@ metric_list:
     ignore_punctuation: true
 metadata:
   - version: 0.0
+
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer the question using a single word or phrase."
+  qwen_vl:
+    pre_prompt: ""
+    post_prompt: " Answer:"
\ No newline at end of file
diff --git a/lmms_eval/tasks/gqa/utils.py b/lmms_eval/tasks/gqa/utils.py
index 4413fb97..5d74d6fb 100644
--- a/lmms_eval/tasks/gqa/utils.py
+++ b/lmms_eval/tasks/gqa/utils.py
@@ -1,6 +1,5 @@
 from datasets import load_dataset
 
-prompt = "\nAnswer the question using a single word or phrase."
 
 GQA_RAW_IMAGE_DATASET = None
 GQA_ID2IMAGE = None
@@ -17,6 +16,8 @@ def gqa_doc_to_visual(doc):
     return [image]
 
 
-def gqa_doc_to_text(doc):
+def gqa_doc_to_text(doc, model_specific_prompt_kwargs):
     question = doc["question"]
-    return f"{question}{prompt}"
+    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
+    post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    return f"{pre_prompt}{question}{post_prompt}"
diff --git a/lmms_eval/tasks/infovqa/utils.py b/lmms_eval/tasks/infovqa/utils.py
index 55412b42..33ba45eb 100644
--- a/lmms_eval/tasks/infovqa/utils.py
+++ b/lmms_eval/tasks/infovqa/utils.py
@@ -6,10 +6,10 @@ def infovqa_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def infovqa_doc_to_text(doc, mdoel_specific_prompt_kwargs):
+def infovqa_doc_to_text(doc, model_specific_prompt_kwargs):
     question = doc["question"]
-    pre_prompt = mdoel_specific_prompt_kwargs["pre_prompt"]
-    post_prompt = mdoel_specific_prompt_kwargs["post_prompt"]
+    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
+    post_prompt = model_specific_prompt_kwargs["post_prompt"]
     return f"{pre_prompt}{question}{post_prompt}"
 
 
diff --git a/lmms_eval/tasks/scienceqa_img/scienceqa.yaml b/lmms_eval/tasks/scienceqa_img/scienceqa.yaml
index d441a15b..8190e3e9 100644
--- a/lmms_eval/tasks/scienceqa_img/scienceqa.yaml
+++ b/lmms_eval/tasks/scienceqa_img/scienceqa.yaml
@@ -24,8 +24,12 @@ metadata:
 
 model_specific_prompt_kwargs:
   default:
+    format: default
     pre_prompt: ""
     post_prompt: "\nAnswer with the option's letter from the given choices directly."
+  qwen_vl:
+    format: qwen_vl
+
 model_specific_generation_kwargs:
   llava:
     image_aspect_ratio: original
diff --git a/lmms_eval/tasks/scienceqa_img/scienceqa_img.yaml b/lmms_eval/tasks/scienceqa_img/scienceqa_img.yaml
index 2df15002..38086b74 100644
--- a/lmms_eval/tasks/scienceqa_img/scienceqa_img.yaml
+++ b/lmms_eval/tasks/scienceqa_img/scienceqa_img.yaml
@@ -24,8 +24,11 @@ metadata:
 
 model_specific_prompt_kwargs:
   default:
+    format: default
     pre_prompt: ""
     post_prompt: "\nAnswer with the option's letter from the given choices directly."
+  qwen_vl:
+    format: qwen_vl
 model_specific_generation_kwargs:
   llava:
     image_aspect_ratio: original
diff --git a/lmms_eval/tasks/scienceqa_img/utils.py b/lmms_eval/tasks/scienceqa_img/utils.py
index 57bf0d95..eed6b26a 100644
--- a/lmms_eval/tasks/scienceqa_img/utils.py
+++ b/lmms_eval/tasks/scienceqa_img/utils.py
@@ -3,12 +3,20 @@ def sqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
     len_choices = len(choices)
     options = [chr(ord("A") + i) for i in range(len_choices)]
     choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)])
-    if context:
-        context = f"Context: {context}\n"
+    if model_specific_prompt_kwargs["format"] == "default":
+        if context:
+            context = f"Context: {context}\n"
 
-    post_prompt = model_specific_prompt_kwargs["post_prompt"]
-    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    return f"{pre_prompt}{context}{question}\n{choices_str}{post_prompt}"
+        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
+        return f"{pre_prompt}{context}{question}\n{choices_str}{post_prompt}"
+    elif model_specific_prompt_kwargs["format"] == "qwen_vl":
+        prompt = "Context: {}\nQuestion: {}\nOptions: {}\nAnswer:"
+        context = context if context else "N/A"
+        prompt = prompt.format(context, question, choices_str)
+        return prompt
+    else:
+        raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs}")
 
 
 def sqa_doc_to_visual(doc):
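
Note: the snippet below is a minimal, standalone sketch (not part of the patch) of how the new model_specific_target_kwargs entries are expected to resolve per model and reach ai2d_doc_to_target. The sample doc, the model names, and resolve_target_kwargs are illustrative; only ai2d_doc_to_target is copied from the diff above.

# Illustrative only: mirrors the fallback logic added in _prepare_model_specific_config.
target_kwargs_cfg = {"default": "mcq", "qwen_vl": "qa"}  # as declared in ai2d.yaml


def resolve_target_kwargs(cfg, model_name):
    """Pick the per-model entry if present, otherwise fall back to 'default'."""
    if cfg is None:
        return None
    if model_name in cfg:
        return cfg[model_name]
    return cfg.get("default", None)


def ai2d_doc_to_target(doc, model_specific_target_kwargs):
    if model_specific_target_kwargs == "mcq":
        options = [chr(ord("A") + i) for i in range(len(doc["options"]))]
        return options[int(doc["answer"])]
    elif model_specific_target_kwargs == "qa":
        return doc["options"][int(doc["answer"])]


doc = {"question": "Which insect is shown?", "options": ["ant", "bee", "wasp"], "answer": 2}
print(ai2d_doc_to_target(doc, resolve_target_kwargs(target_kwargs_cfg, "llava")))    # -> "C"
print(ai2d_doc_to_target(doc, resolve_target_kwargs(target_kwargs_cfg, "qwen_vl")))  # -> "wasp"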