-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_scores.py
126 lines (105 loc) · 4.68 KB
/
plot_scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import seaborn as sns
import json
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import pandas as pd
import pingouin as ping
# Display label of each model variant -> JSON file that get_scores() reads
# its scores from. Paths are relative to the current working directory.
file_mapping = {
    "Self-perfecting":
        "output/feedback_qesconv_dpo_model_generations_0_sft_model_probs.json",
    "SFT":
        "output/feedback_qesconv_sft_model_generations_0_sft_model_probs.json",
    "+ generations":
        "output/feedback_qesconv_ablation_data_model_generations_0_sft_model_probs.json",
    "+ best generations":
        "output/feedback_qesconv_ablation_preference_model_generations_0_sft_model_probs.json",
}
def get_scores(dist, worst_percent=0.1):
    """Return the lowest-scoring fraction of a model's 'improved' scores.

    Reads the JSON file registered for ``dist`` in the module-level
    ``file_mapping``, collects ``output['improved']['prob']`` from every
    output of every prompt that has an ``'improved'`` entry, and keeps only
    the lowest ``worst_percent`` share of those values.

    Args:
        dist: Key into ``file_mapping`` selecting the results file.
        worst_percent: Fraction of the collected scores to keep, counted
            from the lowest score upwards (``0.1`` keeps the worst 10%,
            ``1`` keeps everything).

    Returns:
        The kept scores as a list in ascending order. The list is empty
        when the requested fraction amounts to less than one whole score.

    Raises:
        KeyError: If ``dist`` is not a key of ``file_mapping``.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which differs between operating systems.
    with open(file_mapping[dist], encoding="utf-8") as f:
        data = json.load(f)

    # Outputs without an 'improved' entry carry no score and are skipped.
    scores = sorted(
        output['improved']['prob']
        for prompt in data
        for output in prompt['output']
        if 'improved' in output
    )

    # Round before truncating so binary floating-point error cannot drop a
    # score: 100 * 0.29 evaluates to 28.999999999999996, which int() alone
    # would turn into 28. Genuinely fractional counts are still floored.
    keep = int(round(len(scores) * worst_percent, 9))
    return scores[:keep]
def plot_hist_two_samples(dist_1, dist_2, worst_percent=0.1, num_bins=30, ylim=0.33):
    """Overlay histograms of the worst scores of two models on the current axes.

    Both samples are fetched with ``get_scores`` and drawn with
    ``stat='probability'`` on a shared set of bins over [0, 1]. The function
    also sets the y-limits, y-label, title and legend of the current axes.

    Args:
        dist_1: Model key accepted by ``get_scores``; drawn first, opaque.
        dist_2: Model key accepted by ``get_scores``; drawn second with
            alpha 0.7 so the first histogram stays visible underneath.
        worst_percent: Fraction of lowest scores to plot; forwarded to
            ``get_scores`` and shown in the title as a percentage.
        num_bins: Number of equal-width bins spanning [0, 1].
        ylim: Upper limit of the y-axis (the lower limit is 0).

    Raises:
        KeyError: If a model key has no colour assigned below.
    """
    scores_1 = get_scores(dist_1, worst_percent)
    scores_2 = get_scores(dist_2, worst_percent)

    colours = {
        "Self-perfecting": "orange",
        "SFT": "grey",
        "+ generations": "violet",
        "+ best generations": "turquoise",
    }
    col1 = colours[dist_1]
    col2 = colours[dist_2]

    # Legend labels differ from the internal keys for some models; keys not
    # listed here are shown unchanged.
    legend_names = {
        "Self-perfecting": "Self-improved",
        "+ generations": "+ new data",
        "+ best generations": "+ best scores",
    }
    label_1 = legend_names.get(dist_1, dist_1)
    label_2 = legend_names.get(dist_2, dist_2)

    # num_bins bins require num_bins + 1 edges; passing num_bins to linspace
    # would silently produce one bin fewer than requested.
    bins = np.linspace(0, 1, num_bins + 1)

    sns.histplot(scores_1, color=col1, label=label_1, kde=False,
                 stat='probability', bins=bins, alpha=1.0)
    sns.histplot(scores_2, color=col2, label=label_2, kde=False,
                 stat='probability', bins=bins, alpha=0.7)

    plt.ylim(0, ylim)
    plt.ylabel("Frequency")
    plt.title(f"Scores [worst {int(worst_percent*100)}%]")

    # The legend goes to the upper right for fractions of at most 1% and to
    # the upper left otherwise.
    loc = 'upper right' if worst_percent <= 0.01 else 'upper left'
    plt.legend(loc=loc)
def statistical_tests(dist_1, dist_2, worst_percent=0.1):
    """Print the means of two models' worst scores and two tests comparing them.

    Output, in order: a header naming both models, the mean of each sample,
    the result of ``scipy.stats.ranksums`` and the result of
    ``scipy.stats.ttest_ind`` with ``equal_var=False``.

    Args:
        dist_1: Model key accepted by ``get_scores``.
        dist_2: Model key accepted by ``get_scores``.
        worst_percent: Fraction of lowest scores to compare; forwarded to
            ``get_scores``.
    """
    print(f'\n\n{dist_1} vs {dist_2}')

    # Load each sample once and reuse it for the means and both tests;
    # every get_scores call re-reads and re-sorts the whole JSON file.
    scores_1 = get_scores(dist_1, worst_percent)
    scores_2 = get_scores(dist_2, worst_percent)

    print(f'{dist_1} mean: {np.mean(scores_1)}')
    print(f'{dist_2} mean: {np.mean(scores_2)}')

    print(stats.ranksums(scores_1, scores_2))
    # equal_var=False: the two samples are not assumed to share a variance.
    print(stats.ttest_ind(scores_1, scores_2, equal_var=False))
if __name__ == '__main__':
    # Figure-wide styling, applied before any panel is drawn.
    sns.set_theme("paper", style="white", font_scale=3.0, palette='pastel')
    plt.figure(figsize=(24, 14))
    sns.set_style({'font.family': 'Times New Roman'})

    baseline = "SFT"
    variants = ("+ generations", "+ best generations", "Self-perfecting")

    # 2 x 3 grid, one column per variant compared against the baseline.
    # Top row: worst 1% with num_bins=10 and a y-limit of 0.5.
    # Bottom row: worst 5% with the plotting defaults.
    row_args = ((0.01, 10, 0.5), (0.05,))
    panel = 0
    for extra_args in row_args:
        for variant in variants:
            panel += 1
            plt.subplot(2, 3, panel)
            plot_hist_two_samples(baseline, variant, *extra_args)
            plt.grid(linestyle='dotted', axis='y')

    # plt.gca() is the sixth panel at this point, so the next two calls
    # affect that panel only.
    # NOTE(review): confirm whether set_axisbelow was meant for every panel.
    ax = plt.gca()
    ax.set_axisbelow(True)
    sns.move_legend(ax, "upper left")

    plt.savefig("scores_overall.pdf")
    plt.show()

    # Significance tests on the worst 1%, the worst 5% and all scores.
    for fraction in (0.01, 0.05, 1):
        for variant in variants:
            statistical_tests(baseline, variant, worst_percent=fraction)