[BFCL] Improve Warning Message when Aggregating Results #517

Merged
merged 19 commits on Aug 10, 2024
Changes from all commits

Commits (19)
e6a1f7b
add print statement
HuanzhiMao Jul 9, 2024
3651cfe
update eval_runner_helper with the function check_all_category_present
HuanzhiMao Jul 10, 2024
117207d
fix wording
HuanzhiMao Jul 11, 2024
09c7bdc
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Jul 17, 2024
9bd7760
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Jul 18, 2024
eff4819
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Jul 31, 2024
bbb9b63
Merge remote-tracking branch 'upstream/main' into data-aggregate-warning
CharlieJCJ Jul 31, 2024
30517a0
Merge remote-tracking branch 'refs/remotes/origin/data-aggregate-warn…
CharlieJCJ Jul 31, 2024
a06bc54
[add] actionable eval debug messages
CharlieJCJ Jul 31, 2024
a61d392
[add] in series of commands
CharlieJCJ Jul 31, 2024
4f62912
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Aug 4, 2024
476a595
[add] change to give warning message to models and test categories th…
CharlieJCJ Aug 7, 2024
324b463
Merge remote-tracking branch 'refs/remotes/origin/data-aggregate-warn…
CharlieJCJ Aug 7, 2024
7599110
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Aug 7, 2024
83ead57
[fix] make all_category a set
CharlieJCJ Aug 7, 2024
aab79d9
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Aug 7, 2024
16a80d3
[fix] address issues when result folder doesn't contain the requested…
CharlieJCJ Aug 8, 2024
a8e55b5
disable message when --model is not provided
HuanzhiMao Aug 8, 2024
2605080
Merge branch 'main' into data-aggregate-warning
HuanzhiMao Aug 8, 2024
@@ -404,7 +404,7 @@ def runner(model_names, test_categories, api_sanity_check):
# This is helpful when you only want to run the evaluation for a subset of models and test categories.
update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH)
# Write the leaderboard table to a file
generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH)
generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, model_names, test_categories)

# Clean up the executable expected output files
# They should be re-generated the next time the evaluation is run
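For context, the forwarded subset comes from the evaluation CLI. An illustrative invocation of the subset case described in the comment above (the model name and categories are examples, not part of this diff):

python eval_runner.py --model gorilla-openfunctions-v2 --test-category simple java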
134 changes: 123 additions & 11 deletions berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py
@@ -615,6 +615,8 @@
# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/
V100_x8_PRICE_PER_HOUR = 22.032

RED_FONT = "\033[91m"
RESET = "\033[0m"

def extract_after_test(input_string):
parts = input_string.split("_test_")[1].split("_result")[0].split(".json")[0]
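RED_FONT and RESET are ANSI escape sequences, now defined once at module level so every helper can color its terminal warnings. A minimal usage sketch, not part of this diff:

# "\033[91m" switches output to bright red; "\033[0m" restores the default style.
print(f"{RED_FONT}Example warning text{RESET}")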
@@ -792,10 +794,7 @@ def display_api_status_error(rest_error, executable_error, display_success=False
if display_success:
print("🟢 All API Status Test Passed!")
return None

RED_FONT = "\033[91m"
RESET = "\033[0m"


print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n")

if rest_error:
@@ -953,7 +952,10 @@ def get_metric(model_name, cost_data, latency_data):
return cost, mean_latency, std_latency, percentile_95_latency


def generate_leaderboard_csv(leaderboard_table, output_path):
def generate_leaderboard_csv(
leaderboard_table, output_path, eval_models=None, eval_categories=None
):
print("📈 Aggregating data to generate leaderboard score table...")
data = []
for model_name, value in leaderboard_table.items():
model_name_escaped = model_name.replace("_", "/")
@@ -1027,12 +1029,6 @@ def generate_leaderboard_csv(leaderboard_table, output_path):
model_name_escaped, cost_data, latency_data
)

if overall_accuracy["total_count"] != 1700:
print("-" * 100)
print(
f"❗️Warning: Total count for {model_name} is {overall_accuracy['total_count']}"
)

data.append(
[
"N/A",
@@ -1083,6 +1079,122 @@ def generate_leaderboard_csv(leaderboard_table, output_path):
else:
f.write(",".join(row))

if eval_models:
category_status = check_model_category_status(score_path=output_path)
check_all_category_present(
category_status, eval_models=eval_models, eval_categories=eval_categories
)
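Note that the status check is gated on eval_models, so a default aggregation run without --model prints no per-category warnings. A sketch of the two call patterns, with illustrative argument values:

generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH)  # no model filter given: check skipped
generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, ["example-model"], None)  # checks "example-model" in every category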


def check_model_category_status(score_path):
result_path = score_path.replace("score", "result")

leaderboard_categories = [
"simple",
"multiple_function",
"parallel_function",
"parallel_multiple_function",
"executable_simple",
"executable_multiple_function",
"executable_parallel_function",
"executable_parallel_multiple_function",
"java",
"javascript",
"rest",
"relevance",
]

category_status = {}

# Check for all models in MODEL_METADATA_MAPPING
for model_name in MODEL_METADATA_MAPPING.keys():
category_status[model_name] = {
category: {"generated": False, "evaluated": False}
for category in leaderboard_categories
}

# Check result folder
result_subdir = os.path.join(result_path, model_name)
if os.path.exists(result_subdir):
for result_file in os.listdir(result_subdir):
test_category = result_file.split("_test_")[1].split("_result")[0]
if test_category in category_status[model_name]:
category_status[model_name][test_category]["generated"] = True

# Check score folder
score_subdir = os.path.join(score_path, model_name)
if os.path.exists(score_subdir):
for score_file in os.listdir(score_subdir):
test_category = score_file.split("_score.json")[0]
if test_category in category_status[model_name]:
category_status[model_name][test_category]["evaluated"] = True

return category_status
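For reference, the returned mapping records, per model and per leaderboard category, whether a result file was generated and whether a score file exists. An illustrative slice of the structure (the model name and flag values are made up):

{
    "example-model": {
        "simple": {"generated": True, "evaluated": True},
        "java": {"generated": True, "evaluated": False},
        "rest": {"generated": False, "evaluated": False},
        # ... one entry for each of the twelve leaderboard categories
    },
    # ... one entry per model in MODEL_METADATA_MAPPING
}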


def check_all_category_present(category_status, eval_models=None, eval_categories=None):
found_issues = False
first_time = True
commands = []

for model_name, categories in category_status.items():
if eval_models and model_name not in eval_models:
continue

not_generated = [
cat
for cat, status in categories.items()
if not status["generated"]
and (not eval_categories or cat in eval_categories)
]
not_evaluated = [
cat
for cat, status in categories.items()
if not status["evaluated"]
and (not eval_categories or cat in eval_categories)
]

if not_generated or not_evaluated:
found_issues = True
if first_time:
print(f"We are checking models: {eval_models} and categories: {eval_categories}")
print(f"\n{RED_FONT}{'=' * 30} Model Category Status {'=' * 30}{RESET}")
first_time = False

print(f"{RED_FONT}Model: {model_name}{RESET}")
if not_generated:
print(f"\n Missing results for {len(not_generated)} categories:")
for cat in not_generated:
print(f" - {cat}")
commands.append("cd ..")
commands.append(
f"python openfunctions_evaluation.py --model {model_name} --test-category {' '.join(not_generated)}"
)

if not_evaluated:
print(f"\n Unevaluated results for {len(not_evaluated)} categories:")
for cat in not_evaluated:
print(f" - {cat}")

all_categories = set(not_generated + not_evaluated)
if all_categories:
commands.append("cd eval_checker")
commands.append(
f"python eval_runner.py --model {model_name} --test-category {' '.join(all_categories)}"
)

if found_issues:
print(f"\n{RED_FONT}{'=' * 40} Recommended Actions {'=' * 40}{RESET}\n")
print(
"To address these issues, run the following commands from the current directory:"
)
print("\n" + " && \\\n".join(commands))
print(f"\n{RED_FONT}{'=' * 100}{RESET}\n")
else:
print("🎉 All categories are present and evaluated for all models!\n")

return found_issues
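Because the collected commands are joined with " && \" followed by a line break, the recommendation prints as a single copy-pasteable chain. Illustrative output for a model whose java and javascript results are missing (the model name is made up):

cd .. && \
python openfunctions_evaluation.py --model example-model --test-category java javascript && \
cd eval_checker && \
python eval_runner.py --model example-model --test-category java javascript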


def update_leaderboard_table_with_score_file(leaderboard_table, score_path):
