diff --git a/scripts/hf_eval.py b/scripts/hf_eval.py
index 3c0a4a942..48fb7fdf0 100644
--- a/scripts/hf_eval.py
+++ b/scripts/hf_eval.py
@@ -1,4 +1,5 @@
 import torch
+from tabulate import tabulate
 from transformers import AutoModelForCausalLM, AutoTokenizer

 try:
@@ -9,17 +10,7 @@
     print("""
 Error: The 'lm_eval' module was not found.
 To install, follow these steps:
-
-1. Clone the repository:
-   git clone https://github.com/EleutherAI/lm-evaluation-harness
-
-2. Change to the cloned directory:
-   cd lm-evaluation-harness
-
-3. Install the package in editable mode:
-   pip install -e .
-
-After installation, re-run this script to use the LM Evaluation Harness.
+pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
 """)
     raise  # Re-raise the ImportError

@@ -33,6 +24,21 @@
 torch._inductor.config.force_fuse_int_mm_with_mul = True
 torch._inductor.config.fx_graph_cache = True

+def pretty_print_nested_results(results, precision: int = 6):
+    def format_value(value):
+        if isinstance(value, float):
+            return f"{value:.{precision}f}"
+        return value
+
+    main_table = []
+    for task, metrics in results["results"].items():
+        subtable = [[k, format_value(v)] for k, v in metrics.items() if k != 'alias']
+        subtable.sort(key=lambda x: x[0])  # Sort metrics alphabetically
+        formatted_subtable = tabulate(subtable, tablefmt='grid')
+        main_table.append([task, formatted_subtable])
+
+    print(tabulate(main_table, headers=['Task', 'Metrics'], tablefmt='grid'))
+
 def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compile, batch_size, max_length):

     tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -50,7 +56,6 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
         change_linear_weights_to_int4_woqtensors(model.to(device=device))
     elif quantization == "autoquant":
         model = autoquant(model.to(device=device))
-
     with torch.no_grad():
         result = evaluate(
             HFLM(
@@ -61,8 +66,8 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
             get_task_dict(tasks),
             limit = limit,
         )
-        for task, res in result["results"].items():
-            print(f"{task}: {res}")
+
+        pretty_print_nested_results(result)

 if __name__ == '__main__':
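
For reference, here is a minimal standalone sketch of what the new pretty_print_nested_results helper produces. The function body is reproduced from the hunk above; the mock_results dict is a hand-written, hypothetical stand-in for the structure of lm_eval's evaluate() output (task names and metric values below are made up and not part of the patch). It runs on its own with only tabulate installed.

from tabulate import tabulate

def pretty_print_nested_results(results, precision: int = 6):
    def format_value(value):
        if isinstance(value, float):
            return f"{value:.{precision}f}"
        return value

    main_table = []
    for task, metrics in results["results"].items():
        subtable = [[k, format_value(v)] for k, v in metrics.items() if k != 'alias']
        subtable.sort(key=lambda x: x[0])  # Sort metrics alphabetically
        formatted_subtable = tabulate(subtable, tablefmt='grid')
        main_table.append([task, formatted_subtable])

    print(tabulate(main_table, headers=['Task', 'Metrics'], tablefmt='grid'))

# Hypothetical results dict mimicking the shape of results["results"] as used above.
mock_results = {
    "results": {
        "wikitext": {
            "alias": "wikitext",
            "word_perplexity,none": 12.3456789,
            "byte_perplexity,none": 1.6012345,
        },
        "hellaswag": {
            "alias": "hellaswag",
            "acc,none": 0.5678,
            "acc_norm,none": 0.7123,
        },
    }
}

# Prints an outer grid with one row per task; each row's 'Metrics' cell is itself
# a nested grid of metric name / value pairs, floats rounded to `precision` digits.
pretty_print_nested_results(mock_results)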