diff --git a/scripts/hf_eval.py b/scripts/hf_eval.py
index 3c0a4a942..48fb7fdf0 100644
--- a/scripts/hf_eval.py
+++ b/scripts/hf_eval.py
@@ -1,4 +1,5 @@
 import torch
+from tabulate import tabulate
 from transformers import AutoModelForCausalLM, AutoTokenizer

 try:
@@ -9,17 +10,7 @@
     print("""
 Error: The 'lm_eval' module was not found.
 To install, follow these steps:
-
-1. Clone the repository:
-   git clone https://github.com/EleutherAI/lm-evaluation-harness
-
-2. Change to the cloned directory:
-   cd lm-evaluation-harness
-
-3. Install the package in editable mode:
-   pip install -e .
-
-After installation, re-run this script to use the LM Evaluation Harness.
+pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
 """)
     raise  # Re-raise the ImportError

@@ -33,6 +24,21 @@
 torch._inductor.config.force_fuse_int_mm_with_mul = True
 torch._inductor.config.fx_graph_cache = True

+def pretty_print_nested_results(results, precision: int = 6):
+    def format_value(value):
+        if isinstance(value, float):
+            return f"{value:.{precision}f}"
+        return value
+
+    main_table = []
+    for task, metrics in results["results"].items():
+        subtable = [[k, format_value(v)] for k, v in metrics.items() if k != 'alias']
+        subtable.sort(key=lambda x: x[0])  # Sort metrics alphabetically
+        formatted_subtable = tabulate(subtable, tablefmt='grid')
+        main_table.append([task, formatted_subtable])
+
+    print(tabulate(main_table, headers=['Task', 'Metrics'], tablefmt='grid'))
+
 def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compile, batch_size, max_length):

     tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -50,7 +56,6 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
         change_linear_weights_to_int4_woqtensors(model.to(device=device))
     elif quantization == "autoquant":
         model = autoquant(model.to(device=device))
-
     with torch.no_grad():
         result = evaluate(
             HFLM(
@@ -61,8 +66,8 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
             get_task_dict(tasks),
             limit = limit,
         )
-        for task, res in result["results"].items():
-            print(f"{task}: {res}")
+
+        pretty_print_nested_results(result)

 if __name__ == '__main__':
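
For reference, here is a minimal standalone sketch of what the new pretty_print_nested_results helper produces. The function body is reproduced from the hunk above; the mock_results dict is a hand-written, hypothetical stand-in for the structure of lm_eval's evaluate() output (task names and metric values below are made up and not part of the patch). It runs on its own with only tabulate installed.

from tabulate import tabulate

def pretty_print_nested_results(results, precision: int = 6):
    def format_value(value):
        if isinstance(value, float):
            return f"{value:.{precision}f}"
        return value

    main_table = []
    for task, metrics in results["results"].items():
        subtable = [[k, format_value(v)] for k, v in metrics.items() if k != 'alias']
        subtable.sort(key=lambda x: x[0])  # Sort metrics alphabetically
        formatted_subtable = tabulate(subtable, tablefmt='grid')
        main_table.append([task, formatted_subtable])

    print(tabulate(main_table, headers=['Task', 'Metrics'], tablefmt='grid'))

# Hypothetical results dict mimicking the shape of results["results"] as used above.
mock_results = {
    "results": {
        "wikitext": {
            "alias": "wikitext",
            "word_perplexity,none": 12.3456789,
            "byte_perplexity,none": 1.6012345,
        },
        "hellaswag": {
            "alias": "hellaswag",
            "acc,none": 0.5678,
            "acc_norm,none": 0.7123,
        },
    }
}

# Prints an outer grid with one row per task; each row's 'Metrics' cell is itself
# a nested grid of metric name / value pairs, floats rounded to `precision` digits.
pretty_print_nested_results(mock_results)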