
Commit

FromLog Model (EvolvingLMMs-Lab#114)
* fix

* fix

* fix

* fix logs

* fix

* lint
pufanyi authored Jun 6, 2024
1 parent 722f6d6 commit c3b2ef4
Showing 6 changed files with 76 additions and 20 deletions.
2 changes: 1 addition & 1 deletion lmms_eval/__main__.py
@@ -322,7 +322,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         for task_name, config in results["configs"].items():
             filename = args.output_path.joinpath(f"{task_name}.json")
             # Structure the data with 'args' and 'logs' keys
-            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])} # Convert Namespace to dict
+            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
             samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
             filename.open("w", encoding="utf-8").write(samples_dumped)
             eval_logger.info(f"Saved samples to {filename}")
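
For reference, here is a minimal sketch (not part of this commit) of the shape of each per-task JSON that the block above writes, including the new top-level "time" field that from_log.py below keys on. All values are illustrative placeholders, not output from a real run.

import json

# Hypothetical contents of one saved <task_name>.json (illustrative values only).
data_to_dump = {
    "args": {"model": "some_model", "model_args": "pretrained=...", "limit": None},  # from vars(args)
    "model_configs": {"task": "example_task"},
    "logs": [
        {"doc_id": 0, "resps": [["an example model response"]]},  # entries sorted by doc_id
    ],
    "time": "0606_1530",  # datetime_str recorded at dump time; from_log.py parses it with "%m%d_%H%M"
}

print(json.dumps(data_to_dump, indent=4))
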
74 changes: 64 additions & 10 deletions lmms_eval/models/from_log.py
@@ -1,6 +1,9 @@
 import logging
 import json
+import os
+import re

+from datetime import datetime
 from typing import List, Tuple
 from tqdm import tqdm
 from lmms_eval.api.registry import register_model
@@ -15,24 +18,75 @@
 class FromLog(lmms):
     def __init__(
         self,
-        log_file="",
+        logs: str = "logs",
+        model_name: str = None,
+        model_args: str = None,
+        have_limits: bool = False,
         **kwargs,
     ) -> None:
         super().__init__()

         self.logs = {}

-        with open(log_file, "r") as f:
-            log_data = json.load(f)
+        log_folders = logs.split(",")

-        for data in log_data["logs"]:
-            id = data["doc_id"]
-            response = data["resps"][0]
-            self.logs[id] = response
+        def matched_model(_model_args):
+            if model_name and model_name != _model_args["model"]:
+                return False
+
+            if model_args:
+                _model_args_list = model_args.split(",")
+
+                for _model_arg in _model_args_list:
+                    if _model_arg not in _model_args["model_args"]:
+                        return False
+
+            if not have_limits and _model_args["limit"] is not None:
+                return False
+
+            return True
+
+        for log_folder in log_folders:
+            for root, dirs, files in os.walk(log_folder):
+                for file in files:
+                    if file.endswith(".json"):
+                        try:
+                            log_file = os.path.join(root, file)
+
+                            with open(log_file, "r") as f:
+                                log_data = json.load(f)
+
+                            # check if model is matched
+                            _model_args = log_data["args"]
+                            if not matched_model(_model_args):
+                                raise Exception("Model not matched")
+
+                            # load logs
+                            logs = {}
+                            for data in log_data["logs"]:
+                                id = data["doc_id"]
+                                response = data["resps"][0]
+                                logs[id] = response
+
+                            task = log_data["model_configs"]["task"]
+
+                            pattern = re.compile(r"\d{4}_\d{4}")
+
+                            if "time" in log_data:
+                                log_time = log_data["time"]
+                            elif pattern.search(os.path.abspath(log_file)):
+                                log_time = pattern.findall(os.path.abspath(log_file))[-1]
+                            else:
+                                log_time = "unknown"
+
+                            if task not in self.logs or (self.logs[task]["time"] == "unknown" or datetime.strptime(log_time, "%m%d_%H%M") > datetime.strptime(self.logs[task]["time"], "%m%d_%H%M")):
+                                self.logs[task] = {"time": log_time, "logs": logs}
+
+                        except Exception as e:
+                            pass

         accelerator = Accelerator()
         if accelerator.num_processes > 1:
             assert self.continual_mode is False, "Continual mode is not supported with distributed inference."
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
             self.accelerator = accelerator
             if self.accelerator.is_local_main_process:
@@ -51,7 +105,7 @@ def generate_until(self, requests) -> List[str]:
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

         for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
-            response = self.logs[doc_id]
+            response = self.logs[task]["logs"][doc_id]
             res.append(response[0])
             pbar.update(1)

@@ -60,4 +114,4 @@ def generate_until(self, requests) -> List[str]:

     def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         # TODO
-        assert False, "Gemini API not support"
+        assert False, "not support"
6 changes: 4 additions & 2 deletions lmms_eval/tasks/cvrr/utils.py
@@ -240,6 +240,7 @@ def cvrr_print_scores(eval_file_path, args, task):

     return accuracy, average_score

+
 # Process result for evaluation in temporal task
 def cvrr_process_results(doc, result):
     """
@@ -267,8 +268,8 @@ def cvrr_process_results(doc, result):

     return {
         "gpt_eval_score": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason},
-        "gpt_eval_accuracy": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason}
-    }
+        "gpt_eval_accuracy": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"], "correctness": correctness, "score": score, "reason": reason},
+    }


 def cvrr_gpt_eval(result_file_path, args, task):
@@ -410,6 +411,7 @@ def cvrr_aggregate_results_dim11(results, args):
     accuracy, average_score = cvrr_print_scores(eval_file_path, args, "unusual_and_physically_anomalous_activities")
     return "acc: " + str(accuracy) + "%" + " score: " + str(average_score)

+
 # Factory into different aggregate
 def cvrr_aggregate_score(results, args):
     total_score = 0
4 changes: 2 additions & 2 deletions lmms_eval/tasks/perceptiontest/utils.py
@@ -95,7 +95,7 @@ def perceptiontest_process_results_mc_ppl(doc, result):

 # Process result for generation
 def perceptiontest_process_results_mc(doc, result):
-    pred = result[0]# string prediction "A", "B", "C"
+    pred = result[0]  # string prediction "A", "B", "C"

     # Map the prediction to an index
     pred_to_index = {"A": 0, "B": 1, "C": 2}
@@ -125,5 +125,5 @@ def perceptiontest_aggregate_mc_ppl(results, args):


 def perceptiontest_doc_to_choice(doc):
-    #return [op.split(".")[1].strip() for op in doc["options"]]
+    # return [op.split(".")[1].strip() for op in doc["options"]]
     return [op for op in doc["options"]]
6 changes: 3 additions & 3 deletions lmms_eval/tasks/videochatgpt/utils.py
@@ -131,7 +131,7 @@ def videochatgpt_process_results_generic(doc, result):
         score_detailed_orientation = parse_score(review_detailed_orientation)
         review_context, model_name = get_eval_generic(question, answer, pred, "context", 64)
         score_context = parse_score(review_context)

     except Exception as e:
         eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
         review = "Failed to Get a Proper Review."
@@ -176,7 +176,7 @@ def videochatgpt_process_results_temporal(doc, result):
 # Process result for generation in consistency task
 def videochatgpt_process_results_consistency(doc, result, full_docs=None):
     pred = result[0]

     # if it is question_1, then assign prediction for the 1st question
     # else assign prediction for the 2nd question
     if doc["question_1"] != "None":
@@ -621,7 +621,7 @@ def videochatgpt_aggregate_score(results, args):
     # Iterate over the results to sum scores
     for result_dict in results:
         total_score += result_dict["score"]

     average_score = total_score / len(results) if results else 0
     eval_logger.info(f"Average Score: {average_score}")
     return average_score
4 changes: 2 additions & 2 deletions lmms_eval/tasks/youcook2/utils.py
@@ -75,8 +75,8 @@ def youcook2_aggregate_results(results, metric, **kwargs):
         if result["video"] not in vid2capid:
             vid2capid[result["video"]] = []
         vid2capid[result["video"]].append(uid)
-        cur_gts[uid] = [{'caption': result["answer"]}]
-        cur_res[uid] = [{'caption': result["pred"]}]
+        cur_gts[uid] = [{"caption": result["answer"]}]
+        cur_res[uid] = [{"caption": result["pred"]}]
         uid += 1

     eval_logger.info("tokenization...")
