# print.py: build a Markdown leaderboard from the manishiitg/llm_judge dataset.
import unicodedata

from datasets import load_dataset


def generateLMJudge():
    """Build per-language Markdown leaderboard tables from the llm_judge dataset."""
    ds = load_dataset("manishiitg/llm_judge", split="train")

    def is_hindi(char):
        # A character counts as Hindi if its Unicode name places it in the
        # Devanagari block; unnamed characters raise ValueError.
        try:
            return unicodedata.name(char).startswith('DEVANAGARI')
        except ValueError:
            return False

    def contains_hindi(s):
        return any(is_hindi(char) for char in s)
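    # Example (hypothetical strings, not from the dataset):
    #   contains_hindi("नमस्ते world") -> True   (any Devanagari char suffices)
    #   contains_hindi("hello world") -> False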
    # Materialize the dataset rows into a plain list.
    final_data = list(ds)
    # Bucket ratings by model and by prompt language.
    scores = {}
    for row in final_data:
        # Skip rows still awaiting judgement or rated -1 (invalid),
        # as well as "unalign" rows.
        if not row["judgement_pending"] and row["rating"] != -1:
            if row["type"] == "unalign":
                continue
            model_name = row["model_name"]
            # if model_name in skip_model or "awq" in model_name:
            #     continue
            lang = "en"
            if contains_hindi(row["simple_prompt"]):
                lang = "hi"
            if model_name not in scores:
                scores[model_name] = {}
            if lang not in scores[model_name]:
                scores[model_name][lang] = []
            scores[model_name][lang].append(float(row["rating"]))
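    # At this point scores has the shape (hypothetical values):
    #   {"model-a": {"en": [8.0, 7.5], "hi": [6.0]}, "model-b": {...}}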

    # Flatten into (model, language, average score, question count) tuples.
    model_scores = []
    for model_name in scores:
        for lang in scores[model_name]:
            ratings = scores[model_name][lang]
            avg = sum(ratings) / len(ratings)
            model_scores.append((model_name, lang, avg, len(ratings)))

    # Sort the model scores by average score in descending order.
    model_scores.sort(key=lambda x: x[2], reverse=True)
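    # e.g. (hypothetical): [("model-a", "en", 8.125, 240),
    #                       ("model-b", "hi", 7.900, 180), ...]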

    # Render one Markdown table per language, Hindi first.
    markdown_output = ""
    langs = ["hi", "en"]
    for l in langs:
        markdown_output += f"#### LLM Judge Language: {l}\n"
        markdown_output += "| Model | Language | Score | # Questions |\n"
        markdown_output += "| --- | --- | --- | --- |\n"
        for model_name, lang, avg, count in model_scores:
            if lang == l:
                markdown_output += f"| {model_name} | {lang} | {avg:.4f} | {count} |\n"
        markdown_output += "\n\n"
    return markdown_output


# Build the per-language leaderboard tables and print them as Markdown.
markdown_output = generateLMJudge()
print(markdown_output)
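
# Optional extension (a sketch, not part of the original script): persist the
# leaderboard so it can be pasted into a README. The filename is hypothetical.
# with open("llm_judge_leaderboard.md", "w", encoding="utf-8") as f:
#     f.write(markdown_output)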