From c3b2ef411e1d0ac08ffceef6783df854fbc8d66e Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Thu, 6 Jun 2024 13:33:32 +0800
Subject: [PATCH] FromLog Model (#114)

* fix

* fix

* fix

* fix logs

* fix

* lint
---
 lmms_eval/__main__.py                   |  2 +-
 lmms_eval/models/from_log.py            | 74 +++++++++++++++++++++----
 lmms_eval/tasks/cvrr/utils.py           |  6 +-
 lmms_eval/tasks/perceptiontest/utils.py |  4 +-
 lmms_eval/tasks/videochatgpt/utils.py   |  6 +-
 lmms_eval/tasks/youcook2/utils.py       |  4 +-
 6 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index 26339ef1..69f1cfd4 100755
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -322,7 +322,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         for task_name, config in results["configs"].items():
             filename = args.output_path.joinpath(f"{task_name}.json")
             # Structure the data with 'args' and 'logs' keys
-            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
+            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
             samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
             filename.open("w", encoding="utf-8").write(samples_dumped)
             eval_logger.info(f"Saved samples to {filename}")
diff --git a/lmms_eval/models/from_log.py b/lmms_eval/models/from_log.py
index be324608..4c573e0f 100644
--- a/lmms_eval/models/from_log.py
+++ b/lmms_eval/models/from_log.py
@@ -1,6 +1,9 @@
 import logging
 import json
+import os
+import re
+from datetime import datetime
 from typing import List, Tuple
 from tqdm import tqdm
 from lmms_eval.api.registry import register_model
@@ -15,24 +18,75 @@ class FromLog(lmms):
     def __init__(
         self,
-        log_file="",
+        logs: str = "logs",
+        model_name: str = None,
+        model_args: str = None,
+        have_limits: bool = False,
         **kwargs,
     ) -> None:
         super().__init__()
         self.logs = {}
 
-        with open(log_file, "r") as f:
-            log_data = json.load(f)
+        log_folders = logs.split(",")
 
-        for data in log_data["logs"]:
-            id = data["doc_id"]
-            response = data["resps"][0]
-            self.logs[id] = response
+        def matched_model(_model_args):
+            if model_name and model_name != _model_args["model"]:
+                return False
+
+            if model_args:
+                _model_args_list = model_args.split(",")
+
+                for _model_arg in _model_args_list:
+                    if _model_arg not in _model_args["model_args"]:
+                        return False
+
+            if not have_limits and _model_args["limit"] is not None:
+                return False
+
+            return True
+
+        for log_folder in log_folders:
+            for root, dirs, files in os.walk(log_folder):
+                for file in files:
+                    if file.endswith(".json"):
+                        try:
+                            log_file = os.path.join(root, file)
+
+                            with open(log_file, "r") as f:
+                                log_data = json.load(f)
+
+                            # check if model is matched
+                            _model_args = log_data["args"]
+                            if not matched_model(_model_args):
+                                raise Exception("Model not matched")
+
+                            # load logs
+                            logs = {}
+                            for data in log_data["logs"]:
+                                id = data["doc_id"]
+                                response = data["resps"][0]
+                                logs[id] = response
+
+                            task = log_data["model_configs"]["task"]
+
+                            pattern = re.compile(r"\d{4}_\d{4}")
+
+                            if "time" in log_data:
+                                log_time = log_data["time"]
+                            elif pattern.search(os.path.abspath(log_file)):
+                                log_time = pattern.findall(os.path.abspath(log_file))[-1]
+                            else:
+                                log_time = "unknown"
+
+                            if task not in self.logs or (self.logs[task]["time"] == "unknown" or datetime.strptime(log_time, "%m%d_%H%M") > datetime.strptime(self.logs[task]["time"], "%m%d_%H%M")):
+                                self.logs[task] = {"time": log_time, "logs": logs}
+
+                        except Exception as e:
+                            pass
 
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
-            assert self.continual_mode is False, "Continual mode is not supported with distributed inference."
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
             self.accelerator = accelerator
             if self.accelerator.is_local_main_process:
@@ -51,7 +105,7 @@ def generate_until(self, requests) -> List[str]:
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
 
         for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
-            response = self.logs[doc_id]
+            response = self.logs[task]["logs"][doc_id]
             res.append(response[0])
             pbar.update(1)
 
@@ -60,4 +114,4 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         # TODO
-        assert False, "Gemini API not support"
+        assert False, "not support"
diff --git a/lmms_eval/tasks/cvrr/utils.py b/lmms_eval/tasks/cvrr/utils.py
index c1006306..1e8578a5 100755
--- a/lmms_eval/tasks/cvrr/utils.py
+++ b/lmms_eval/tasks/cvrr/utils.py
@@ -240,6 +240,7 @@ def cvrr_print_scores(eval_file_path, args, task):
 
     return accuracy, average_score
 
+
 # Process result for evaluation in temporal task
 def cvrr_process_results(doc, result):
     """
@@ -267,8 +268,8 @@ def cvrr_process_results(doc, result):
 
     return {
         "gpt_eval_score": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason},
-        "gpt_eval_accuracy": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason}
-    }
+        "gpt_eval_accuracy": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason},
+    }
 
 
 def cvrr_gpt_eval(result_file_path, args, task):
@@ -410,6 +411,7 @@ def cvrr_aggregate_results_dim11(results, args):
     accuracy, average_score = cvrr_print_scores(eval_file_path, args, "unusual_and_physically_anomalous_activities")
     return "acc: " + str(accuracy) + "%" + " score: " + str(average_score)
 
+
 # Factory into different aggregate
 def cvrr_aggregate_score(results, args):
     total_score = 0
diff --git a/lmms_eval/tasks/perceptiontest/utils.py b/lmms_eval/tasks/perceptiontest/utils.py
index c6ccdf3c..00f012c6 100755
--- a/lmms_eval/tasks/perceptiontest/utils.py
+++ b/lmms_eval/tasks/perceptiontest/utils.py
@@ -95,7 +95,7 @@ def perceptiontest_process_results_mc_ppl(doc, result):
 
 # Process result for generation
 def perceptiontest_process_results_mc(doc, result):
-    pred = result[0]# string prediction "A", "B", "C"
+    pred = result[0]  # string prediction "A", "B", "C"
 
     # Map the prediction to an index
     pred_to_index = {"A": 0, "B": 1, "C": 2}
@@ -125,5 +125,5 @@ def perceptiontest_aggregate_mc_ppl(results, args):
 
 
 def perceptiontest_doc_to_choice(doc):
-    #return [op.split(".")[1].strip() for op in doc["options"]]
+    # return [op.split(".")[1].strip() for op in doc["options"]]
     return [op for op in doc["options"]]
diff --git a/lmms_eval/tasks/videochatgpt/utils.py b/lmms_eval/tasks/videochatgpt/utils.py
index 6d68474d..899bb0a2 100755
--- a/lmms_eval/tasks/videochatgpt/utils.py
+++ b/lmms_eval/tasks/videochatgpt/utils.py
@@ -131,7 +131,7 @@ def videochatgpt_process_results_generic(doc, result):
         score_detailed_orientation = parse_score(review_detailed_orientation)
         review_context, model_name = get_eval_generic(question, answer, pred, "context", 64)
         score_context = parse_score(review_context)
-
+
     except Exception as e:
         eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
         review = "Failed to Get a Proper Review."
@@ -176,7 +176,7 @@ def videochatgpt_process_results_temporal(doc, result):
 # Process result for generation in consistency task
 def videochatgpt_process_results_consistency(doc, result, full_docs=None):
     pred = result[0]
-
+
     # if it is question_1, then assign prediction for the 1st question
     # else assign prediction for the 2nd question
     if doc["question_1"] != "None":
@@ -621,7 +621,7 @@ def videochatgpt_aggregate_score(results, args):
     # Iterate over the results to sum scores
     for result_dict in results:
        total_score += result_dict["score"]
-
+
     average_score = total_score / len(results) if results else 0
     eval_logger.info(f"Average Score: {average_score}")
     return average_score
diff --git a/lmms_eval/tasks/youcook2/utils.py b/lmms_eval/tasks/youcook2/utils.py
index b179833c..1761a8b1 100644
--- a/lmms_eval/tasks/youcook2/utils.py
+++ b/lmms_eval/tasks/youcook2/utils.py
@@ -75,8 +75,8 @@ def youcook2_aggregate_results(results, metric, **kwargs):
         if result["video"] not in vid2capid:
             vid2capid[result["video"]] = []
         vid2capid[result["video"]].append(uid)
-        cur_gts[uid] = [{'caption': result["answer"]}]
-        cur_res[uid] = [{'caption': result["pred"]}]
+        cur_gts[uid] = [{"caption": result["answer"]}]
+        cur_res[uid] = [{"caption": result["pred"]}]
         uid += 1
 
     eval_logger.info("tokenization...")
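
Usage sketch (illustrative, not part of the patch): the new FromLog model replays responses recorded by an earlier run instead of querying a live model. The constructor arguments below come straight from the diff; the log directory, the model name "llava", and the assumption that the class is registered as "from_log" are placeholders, since the registration line is not shown in these hunks.

    # Minimal sketch: reuse cached responses from a previous run's JSON logs.
    # Assumes lmms_eval is installed and that "./logs" contains files written by
    # cli_evaluate_single (each with "args", "model_configs", "logs", and "time").
    from lmms_eval.models.from_log import FromLog

    model = FromLog(
        logs="./logs",        # comma-separated list of folders scanned for *.json logs
        model_name="llava",   # only reuse logs whose recorded args["model"] matches
        have_limits=False,    # skip logs that were produced with a --limit restriction
    )
    # generate_until() then serves responses from self.logs[task]["logs"][doc_id],
    # preferring the newest log per task (by the "time" field or the MMDD_HHMM folder stamp).

In practice this would normally be driven through the CLI, e.g. `--model from_log --model_args logs=<log_dir>`, with flag names as defined in lmms_eval/__main__.py.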