
Commit

FromLog Model (EvolvingLMMs-Lab#114)
* fix

* fix

* fix

* fix logs

* fix

* lint
pufanyi authored Jun 6, 2024
1 parent 722f6d6 commit c3b2ef4
Showing 6 changed files with 76 additions and 20 deletions.
2 changes: 1 addition & 1 deletion lmms_eval/__main__.py
@@ -322,7 +322,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         for task_name, config in results["configs"].items():
             filename = args.output_path.joinpath(f"{task_name}.json")
             # Structure the data with 'args' and 'logs' keys
-            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])} # Convert Namespace to dict
+            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
             samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
             filename.open("w", encoding="utf-8").write(samples_dumped)
             eval_logger.info(f"Saved samples to {filename}")
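
For reference, here is a minimal sketch (not part of this commit) of the shape of each per-task JSON that the block above writes, including the new top-level "time" field that from_log.py below keys on. All values are illustrative placeholders, not output from a real run.

import json

# Hypothetical contents of one saved <task_name>.json (illustrative values only).
data_to_dump = {
    "args": {"model": "some_model", "model_args": "pretrained=...", "limit": None},  # from vars(args)
    "model_configs": {"task": "example_task"},
    "logs": [
        {"doc_id": 0, "resps": [["an example model response"]]},  # entries sorted by doc_id
    ],
    "time": "0606_1530",  # datetime_str recorded at dump time; from_log.py parses it with "%m%d_%H%M"
}

print(json.dumps(data_to_dump, indent=4))
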
74 changes: 64 additions & 10 deletions lmms_eval/models/from_log.py
@@ -1,6 +1,9 @@
 import logging
 import json
+import os
+import re

+from datetime import datetime
 from typing import List, Tuple
 from tqdm import tqdm
 from lmms_eval.api.registry import register_model
@@ -15,24 +18,75 @@
 class FromLog(lmms):
     def __init__(
         self,
-        log_file="",
+        logs: str = "logs",
+        model_name: str = None,
+        model_args: str = None,
+        have_limits: bool = False,
         **kwargs,
     ) -> None:
         super().__init__()

         self.logs = {}

-        with open(log_file, "r") as f:
-            log_data = json.load(f)
+        log_folders = logs.split(",")

-        for data in log_data["logs"]:
-            id = data["doc_id"]
-            response = data["resps"][0]
-            self.logs[id] = response
+        def matched_model(_model_args):
+            if model_name and model_name != _model_args["model"]:
+                return False
+
+            if model_args:
+                _model_args_list = model_args.split(",")
+
+                for _model_arg in _model_args_list:
+                    if _model_arg not in _model_args["model_args"]:
+                        return False
+
+            if not have_limits and _model_args["limit"] is not None:
+                return False
+
+            return True
+
+        for log_folder in log_folders:
+            for root, dirs, files in os.walk(log_folder):
+                for file in files:
+                    if file.endswith(".json"):
+                        try:
+                            log_file = os.path.join(root, file)
+
+                            with open(log_file, "r") as f:
+                                log_data = json.load(f)
+
+                            # check if model is matched
+                            _model_args = log_data["args"]
+                            if not matched_model(_model_args):
+                                raise Exception("Model not matched")
+
+                            # load logs
+                            logs = {}
+                            for data in log_data["logs"]:
+                                id = data["doc_id"]
+                                response = data["resps"][0]
+                                logs[id] = response
+
+                            task = log_data["model_configs"]["task"]
+
+                            pattern = re.compile(r"\d{4}_\d{4}")
+
+                            if "time" in log_data:
+                                log_time = log_data["time"]
+                            elif pattern.search(os.path.abspath(log_file)):
+                                log_time = pattern.findall(os.path.abspath(log_file))[-1]
+                            else:
+                                log_time = "unknown"
+
+                            if task not in self.logs or (self.logs[task]["time"] == "unknown" or datetime.strptime(log_time, "%m%d_%H%M") > datetime.strptime(self.logs[task]["time"], "%m%d_%H%M")):
+                                self.logs[task] = {"time": log_time, "logs": logs}
+
+                        except Exception as e:
+                            pass

         accelerator = Accelerator()
         if accelerator.num_processes > 1:
             assert self.continual_mode is False, "Continual mode is not supported with distributed inference."
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
             self.accelerator = accelerator
             if self.accelerator.is_local_main_process:
@@ -51,7 +105,7 @@ def generate_until(self, requests) -> List[str]:
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

         for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
-            response = self.logs[doc_id]
+            response = self.logs[task]["logs"][doc_id]
             res.append(response[0])
             pbar.update(1)

@@ -60,4 +114,4 @@ def generate_until(self, requests) -> List[str]:

     def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         # TODO
-        assert False, "Gemini API not support"
+        assert False, "not support"
6 changes: 4 additions & 2 deletions lmms_eval/tasks/cvrr/utils.py
@@ -240,6 +240,7 @@ def cvrr_print_scores(eval_file_path, args, task):

     return accuracy, average_score

+
 # Process result for evaluation in temporal task
 def cvrr_process_results(doc, result):
     """
@@ -267,8 +268,8 @@ def cvrr_process_results(doc, result):

     return {
         "gpt_eval_score": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason},
-        "gpt_eval_accuracy": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason}
-    }
+        "gpt_eval_accuracy": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason},
+    }


 def cvrr_gpt_eval(result_file_path, args, task):
@@ -410,6 +411,7 @@ def cvrr_aggregate_results_dim11(results, args):
     accuracy, average_score = cvrr_print_scores(eval_file_path, args, "unusual_and_physically_anomalous_activities")
     return "acc: " + str(accuracy) + "%" + " score: " + str(average_score)

+
 # Factory into different aggregate
 def cvrr_aggregate_score(results, args):
     total_score = 0
4 changes: 2 additions & 2 deletions lmms_eval/tasks/perceptiontest/utils.py
@@ -95,7 +95,7 @@ def perceptiontest_process_results_mc_ppl(doc, result):

 # Process result for generation
 def perceptiontest_process_results_mc(doc, result):
-    pred = result[0]# string prediction "A", "B", "C"
+    pred = result[0]  # string prediction "A", "B", "C"

     # Map the prediction to an index
     pred_to_index = {"A": 0, "B": 1, "C": 2}
@@ -125,5 +125,5 @@ def perceptiontest_aggregate_mc_ppl(results, args):


 def perceptiontest_doc_to_choice(doc):
-    #return [op.split(".")[1].strip() for op in doc["options"]]
+    # return [op.split(".")[1].strip() for op in doc["options"]]
     return [op for op in doc["options"]]
6 changes: 3 additions & 3 deletions lmms_eval/tasks/videochatgpt/utils.py
@@ -131,7 +131,7 @@ def videochatgpt_process_results_generic(doc, result):
         score_detailed_orientation = parse_score(review_detailed_orientation)
         review_context, model_name = get_eval_generic(question, answer, pred, "context", 64)
         score_context = parse_score(review_context)

     except Exception as e:
         eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
         review = "Failed to Get a Proper Review."
@@ -176,7 +176,7 @@ def videochatgpt_process_results_temporal(doc, result):
 # Process result for generation in consistency task
 def videochatgpt_process_results_consistency(doc, result, full_docs=None):
     pred = result[0]

     # if it is question_1, then assign prediction for the 1st question
     # else assign prediction for the 2nd question
     if doc["question_1"] != "None":
@@ -621,7 +621,7 @@ def videochatgpt_aggregate_score(results, args):
     # Iterate over the results to sum scores
     for result_dict in results:
         total_score += result_dict["score"]

     average_score = total_score / len(results) if results else 0
     eval_logger.info(f"Average Score: {average_score}")
     return average_score
4 changes: 2 additions & 2 deletions lmms_eval/tasks/youcook2/utils.py
@@ -75,8 +75,8 @@ def youcook2_aggregate_results(results, metric, **kwargs):
         if result["video"] not in vid2capid:
             vid2capid[result["video"]] = []
         vid2capid[result["video"]].append(uid)
-        cur_gts[uid] = [{'caption': result["answer"]}]
-        cur_res[uid] = [{'caption': result["pred"]}]
+        cur_gts[uid] = [{"caption": result["answer"]}]
+        cur_res[uid] = [{"caption": result["pred"]}]
         uid += 1

     eval_logger.info("tokenization...")
