for-ai · ljvmiranda921 · Oct 12, 2024 · Oct 11, 2024
diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py
@@ -0,0 +1,92 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+
+data = {
+  "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+    0.3533086666014079,
+    0.052422082615756406
+  ],
+  "cohere/c4ai-aya-23-35b": [
+    0.43767196047824003,
+    0.026040919354464294
+  ],
+  "cohere/c4ai-aya-23-8b": [
+    0.013483014909052663,
+    0.03363706833599835
+  ],
+  "cohere/command-r-08-2024": [
+    0.374457668650282,
+    0.02926089754079793
+  ],
+  "cohere/command-r-plus-08-2024": [
+    0.3830841816733316,
+    0.020185255968455686
+  ],
+  "google/gemma-1.1-7b-it": [
+    0.5190375637539242,
+    0.027757722654111305
+  ],
+  "google/gemma-2-9b-it": [
+    0.5181663123111222,
+    0.031090119385244894
+  ],
+  "meta-llama/Meta-Llama-3-70B-Instruct": [
+    0.5685224105896568,
+    0.04853344616275034
+  ],
+  "meta-llama/Meta-Llama-3-8B-Instruct": [
+    0.37936948540837095,
+    0.032172769265151994
+  ],
+  "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+    0.603536768244583,
+    0.027191895488989915
+  ],
+  "mistralai/Mistral-7B-Instruct-v0.2": [
+    0.4071166722276529,
+    0.04577594028555328
+  ],
+  "mistralai/Mistral-7B-Instruct-v0.3": [
+    0.41195018984687265,
+    0.056184679972755454
+  ],
+  "openai/gpt-4-turbo-2024-04-09": [
+    0.6106943361444249,
+    0.02932446842558468
+  ],
+  "openai/gpt-4o-2024-05-13": [
+    0.5833874065757011,
+    0.023695391445384514
+  ]
+}
+
+sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0]))
+labels_sorted = list(sorted_data.keys())
+means_sorted = [v[0] for v in sorted_data.values()]
+std_devs_sorted = [v[1] for v in sorted_data.values()]
+
+sns.set(style="whitegrid")
+palette = sns.color_palette("coolwarm", len(labels_sorted))
+
+plt.figure(figsize=(10, 6))
+x_pos_sorted = np.arange(len(labels_sorted))
+
+ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, palette=palette, errorbar=None)
+plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt='none', c='black', capsize=5)
+
+ax1.spines['top'].set_color('black')
+ax1.spines['right'].set_color('black')
+ax1.spines['left'].set_color('black')
+ax1.spines['bottom'].set_color('black')
+for spine in ax1.spines.values():
+    spine.set_linewidth(2)  # Make the border thicker
+
+plt.ylim(0, 0.8)
+
+plt.xticks(x_pos_sorted, labels_sorted, rotation=90)
+plt.ylabel("Cohen's Kappa")
+plt.title('Average Inner-Model Agreement Across Languages')
+
+plt.tight_layout()
+plt.savefig(f"./innermodel_agreement.pdf", bbox_inches='tight')
diff --git a/analysis/maple_results.py b/analysis/maple_results.py
@@ -0,0 +1,101 @@
+import json
+from pathlib import Path
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from huggingface_hub import snapshot_download
+import datasets
+import json
+
+import numpy as np
+import matplotlib.pyplot as plt
+from itertools import combinations
+from collections import defaultdict
+
+
+FONT_SIZES = {"small": 12, "medium": 16, "large": 18}
+
+PLOT_PARAMS = {
+  "font.family": "serif",
+  "font.serif": ["Times New Roman", "STIX"],
+  "font.size": FONT_SIZES.get("medium"),
+  "axes.titlesize": FONT_SIZES.get("large"),
+  "axes.labelsize": FONT_SIZES.get("large"),
+  "xtick.labelsize": FONT_SIZES.get("large"),
+  "ytick.labelsize": FONT_SIZES.get("small"),
+  "legend.fontsize": FONT_SIZES.get("medium"),
+  "figure.titlesize": FONT_SIZES.get("medium"),
+  "text.usetex": False,
+}
+
+logging.basicConfig(level=logging.INFO)
+
+plt.rcParams.update(PLOT_PARAMS)
+
+def load_json(json_file_path):
+  with open(json_file_path, "r") as file:
+    json_data = json.load(file)
+  return json_data
+
+results_dir = 'data/eval-results-maple'
+results_path = Path(results_dir)
+
+results_all = []
+for result_file in results_path.glob("*.json"):
+  raw_results = load_json(result_file)
+  if "leaderboard" in raw_results.keys():
+    model_id = raw_results["model"]
+    subset_results = raw_results['subset']
+    overall = raw_results['scores']['accuracy']
+    remove_key = ['model', 'model_type', 'chat_template']
+    for key in remove_key:
+      del subset_results[key]
+  elif "subset_results" in raw_results.keys():
+    model_id = raw_results["model"]
+    subset_results = raw_results['subset_results']
+    overall = raw_results['accuracy']
+  else:
+    model_id = raw_results["model"]
+    subset_results = raw_results['extra_results']
+    overall = raw_results['accuracy']
+  # print(model_id, overall)
+  # print("\t", subset_results)
+  # results_all.append([model_id, overall, subset_results])
+  results_all.append({'Model': model_id, 'Avg': overall, **subset_results})
+
+  # import ipdb; ipdb.set_trace()
+
+TOP = 10  
+# results_all.sort(key=lambda x: x[1], reverse=True)
+# results_all = results_all[:TOP]
+# print(results_all)
+
+df_results = pd.DataFrame(results_all)
+df_results = df_results.sort_values(by='Avg', ascending=False).reset_index(drop=True)
+df_results = df_results.head(10).reset_index(drop=True)
+
+df_results.columns = df_results.columns.str.replace('^maple-', '', regex=True)
+df_results = df_results.set_index("Model")
+df_results = df_results * 100
+fig, ax = plt.subplots(1, 1, figsize=(18, 5))
+
+sns.heatmap(df_results, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, 
+            fmt=".1f", cbar=False)
+
+ax.xaxis.set_ticks_position("top")
+ax.tick_params(axis="x", labelrotation=45)
+ax.set_ylabel("")
+ax.set_yticklabels([f"{model}     " for model in df_results.index])
+
+plt.tight_layout()
+
+plt.savefig("plots/maple.pdf", bbox_inches="tight")
+# import ipdb; ipdb.set_trace()
+
+